From a62cfc3ccf2d096b28e041f2d8bbbe25f68ad00f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=BB=8B=E6=B6=B5=20Zephyr=20Wang?= Date: Sat, 29 May 2021 18:10:00 +0800 Subject: [PATCH 001/718] Fix typo in common.h --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index ac795937c..d27fc4cb7 100644 --- a/common.h +++ b/common.h @@ -392,7 +392,7 @@ typedef int blasint; #endif /*** -To alloc job_t on heap or statck. +To alloc job_t on heap or stack. please https://github.com/xianyi/OpenBLAS/issues/246 ***/ #if defined(OS_WINDOWS) From 448fe1c42a928f2a090f7ec1a5be9b6e81ccbf0f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 24 Aug 2022 19:24:01 +0800 Subject: [PATCH 002/718] Test on PolarFire Soc. `make NOFORTRAN=1 CC=gcc` --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index deef75819..99d603d03 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -118,7 +118,7 @@ Z13 Z14 10.RISC-V 64: -RISCV64_GENERIC +RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) C910V 11.LOONGARCH64: From bef47917bd72f35c151038fee0cf485445476863 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 00:06:25 -0800 Subject: [PATCH 003/718] Initial version for riscv sifive x280 --- Makefile.install | 7 + Makefile.prebuild | 8 + Makefile.riscv64 | 8 + README.md | 5 + TargetList.txt | 1 + benchmark/Makefile | 6 + common_riscv64.h | 4 + cpuid_riscv64.c | 2 + getarch.c | 12 + kernel/riscv64/KERNEL.x280 | 267 ++++++++ kernel/riscv64/amax_rvv.c | 102 +++ kernel/riscv64/amin_rvv.c | 102 +++ kernel/riscv64/asum_rvv.c | 99 +++ kernel/riscv64/axpby_rvv.c | 171 +++++ kernel/riscv64/axpy_rvv.c | 109 +++ kernel/riscv64/copy_rvv.c | 94 +++ kernel/riscv64/dot_rvv.c | 126 ++++ kernel/riscv64/gemm_beta_rvv.c | 89 +++ kernel/riscv64/gemm_ncopy_2_rvv.c | 92 +++ kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ++++ kernel/riscv64/gemm_ncopy_8_rvv.c | 164 +++++ kernel/riscv64/gemm_ncopy_rvv_v1.c | 76 +++ kernel/riscv64/gemm_tcopy_2_rvv.c | 108 +++ kernel/riscv64/gemm_tcopy_4_rvv.c | 236 +++++++ kernel/riscv64/gemm_tcopy_8_rvv.c | 264 ++++++++ kernel/riscv64/gemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ++++++ kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ++++++++++++++ kernel/riscv64/gemmkernel_rvv_v1x8.c | 601 +++++++++++++++++ kernel/riscv64/gemv_n_rvv.c | 94 +++ kernel/riscv64/gemv_t_rvv.c | 119 ++++ kernel/riscv64/iamax_rvv.c | 150 +++++ kernel/riscv64/iamin_rvv.c | 151 +++++ kernel/riscv64/imax_rvv.c | 147 +++++ kernel/riscv64/imin_rvv.c | 147 +++++ kernel/riscv64/izamax_rvv.c | 162 +++++ kernel/riscv64/izamin_rvv.c | 161 +++++ kernel/riscv64/max_rvv.c | 98 +++ kernel/riscv64/min_rvv.c | 98 +++ kernel/riscv64/nrm2_rvv.c | 117 ++++ kernel/riscv64/rot_rvv.c | 149 +++++ kernel/riscv64/scal_rvv.c | 80 +++ kernel/riscv64/sum_rvv.c | 95 +++ kernel/riscv64/swap_rvv.c | 142 ++++ kernel/riscv64/symm_lcopy_rvv_v1.c | 101 +++ kernel/riscv64/symm_ucopy_rvv_v1.c | 100 +++ kernel/riscv64/symv_L_rvv.c | 224 +++++++ kernel/riscv64/symv_U_rvv.c | 221 +++++++ kernel/riscv64/trmm_lncopy_rvv_v1.c | 138 ++++ kernel/riscv64/trmm_ltcopy_rvv_v1.c | 134 ++++ kernel/riscv64/trmm_uncopy_rvv_v1.c | 136 ++++ kernel/riscv64/trmm_utcopy_rvv_v1.c | 133 ++++ kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ++++++++++ kernel/riscv64/trmmkernel_4x4_rvv.c | 881 +++++++++++++++++++++++++ kernel/riscv64/trmmkernel_rvv_v1x8.c | 685 +++++++++++++++++++ kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 847 ++++++++++++++++++++++++ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 840 +++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 792 ++++++++++++++++++++++ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 828 +++++++++++++++++++++++ kernel/riscv64/trsm_lncopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_ltcopy_rvv_v1.c | 122 ++++ kernel/riscv64/trsm_uncopy_rvv_v1.c | 121 ++++ kernel/riscv64/trsm_utcopy_rvv_v1.c | 123 ++++ kernel/riscv64/zamax_rvv.c | 113 ++++ kernel/riscv64/zamin_rvv.c | 112 ++++ kernel/riscv64/zasum_rvv.c | 108 +++ kernel/riscv64/zaxpby_rvv.c | 151 +++++ kernel/riscv64/zaxpy_rvv.c | 154 +++++ kernel/riscv64/zcopy_rvv.c | 105 +++ kernel/riscv64/zdot_rvv.c | 170 +++++ kernel/riscv64/zgemm_beta_rvv.c | 117 ++++ kernel/riscv64/zgemv_n_rvv.c | 170 +++++ kernel/riscv64/zgemv_t_rvv.c | 172 +++++ kernel/riscv64/znrm2_rvv.c | 122 ++++ kernel/riscv64/zrot_rvv.c | 181 +++++ kernel/riscv64/zscal_rvv.c | 148 +++++ kernel/riscv64/zsum_rvv.c | 97 +++ kernel/riscv64/zswap_rvv.c | 156 +++++ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 596 +++++++++++++++++ param.h | 44 ++ 80 files changed, 15188 insertions(+) create mode 100644 kernel/riscv64/KERNEL.x280 create mode 100644 kernel/riscv64/amax_rvv.c create mode 100644 kernel/riscv64/amin_rvv.c create mode 100644 kernel/riscv64/asum_rvv.c create mode 100644 kernel/riscv64/axpby_rvv.c create mode 100644 kernel/riscv64/axpy_rvv.c create mode 100644 kernel/riscv64/copy_rvv.c create mode 100644 kernel/riscv64/dot_rvv.c create mode 100644 kernel/riscv64/gemm_beta_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv.c create mode 100644 kernel/riscv64/gemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/gemmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/gemv_n_rvv.c create mode 100644 kernel/riscv64/gemv_t_rvv.c create mode 100644 kernel/riscv64/iamax_rvv.c create mode 100644 kernel/riscv64/iamin_rvv.c create mode 100644 kernel/riscv64/imax_rvv.c create mode 100644 kernel/riscv64/imin_rvv.c create mode 100644 kernel/riscv64/izamax_rvv.c create mode 100644 kernel/riscv64/izamin_rvv.c create mode 100644 kernel/riscv64/max_rvv.c create mode 100644 kernel/riscv64/min_rvv.c create mode 100644 kernel/riscv64/nrm2_rvv.c create mode 100644 kernel/riscv64/rot_rvv.c create mode 100644 kernel/riscv64/scal_rvv.c create mode 100644 kernel/riscv64/sum_rvv.c create mode 100644 kernel/riscv64/swap_rvv.c create mode 100644 kernel/riscv64/symm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/symm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/symv_L_rvv.c create mode 100644 kernel/riscv64/symv_U_rvv.c create mode 100644 kernel/riscv64/trmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c create mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c create mode 100644 kernel/riscv64/trmmkernel_rvv_v1x8.c create mode 100644 kernel/riscv64/trsm_kernel_LN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_LT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RN_rvv_v1.c create mode 100644 kernel/riscv64/trsm_kernel_RT_rvv_v1.c create mode 100644 kernel/riscv64/trsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/trsm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zamax_rvv.c create mode 100644 kernel/riscv64/zamin_rvv.c create mode 100644 kernel/riscv64/zasum_rvv.c create mode 100644 kernel/riscv64/zaxpby_rvv.c create mode 100644 kernel/riscv64/zaxpy_rvv.c create mode 100644 kernel/riscv64/zcopy_rvv.c create mode 100644 kernel/riscv64/zdot_rvv.c create mode 100644 kernel/riscv64/zgemm_beta_rvv.c create mode 100644 kernel/riscv64/zgemv_n_rvv.c create mode 100644 kernel/riscv64/zgemv_t_rvv.c create mode 100644 kernel/riscv64/znrm2_rvv.c create mode 100644 kernel/riscv64/zrot_rvv.c create mode 100644 kernel/riscv64/zscal_rvv.c create mode 100644 kernel/riscv64/zsum_rvv.c create mode 100644 kernel/riscv64/zswap_rvv.c create mode 100644 kernel/riscv64/ztrmmkernel_2x2_rvv.c diff --git a/Makefile.install b/Makefile.install index 87b5bc870..f1adaa271 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,6 +8,7 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin +OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -38,6 +39,7 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" + @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -202,3 +204,8 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! +#Generating release tar + @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz + @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . + + diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..e6a8eab59 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), x280) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +endif + +ifeq ($(TARGET), RISCV64_GENERIC) +TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..d6eaf552d 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -2,3 +2,11 @@ ifeq ($(CORE), C910V) CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif +ifeq ($(CORE), x280) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +endif +ifeq ($(CORE), RISCV64_GENERIC) +CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static +endif \ No newline at end of file diff --git a/README.md b/README.md index 6ce85e08e..6ecb46178 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) +- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. + ```sh + make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. diff --git a/TargetList.txt b/TargetList.txt index deef75819..6c533361e 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -120,6 +120,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC C910V +x280 11.LOONGARCH64: LOONGSONGENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..734c83a26 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ diff --git a/common_riscv64.h b/common_riscv64.h index 7ddbe80a4..221a79901 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,6 +92,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) +#include +#endif + +#if defined(x280) #include #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..5326787e6 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 +#define CPU_x280 2 static char *cpuname[] = { "RISCV64_GENERIC", "C910V" + "x280" }; int detect(void){ diff --git a/getarch.c b/getarch.c index cde5b4e83..0d197285c 100644 --- a/getarch.c +++ b/getarch.c @@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..2eb60f2b4 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,267 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_2x2_rvv.c +ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 2) +SGEMMKERNEL = gemmkernel_2x2_rvv.c +SGEMMONCOPY = gemm_ncopy_2_rvv.c +SGEMMOTCOPY = gemm_tcopy_2_rvv.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMKERNEL = gemmkernel_4x4_rvv.c +SGEMMONCOPY = gemm_ncopy_4_rvv.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +STRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c +SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 2) +DGEMMKERNEL = gemmkernel_2x2_rvv.c +DGEMMONCOPY = gemm_ncopy_2_rvv.c +DGEMMOTCOPY = gemm_tcopy_2_rvv.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_2x2_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMKERNEL = gemmkernel_4x4_rvv.c +DGEMMONCOPY = gemm_ncopy_4_rvv.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRMMKERNEL = trmmkernel_4x4_rvv.c +else ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..c9c6e7f73 --- /dev/null +++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..370b6c338 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c new file mode 100644 index 000000000..4f711c9be --- /dev/null +++ b/kernel/riscv64/asum_rvv.c @@ -0,0 +1,99 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c new file mode 100644 index 000000000..7c35c563d --- /dev/null +++ b/kernel/riscv64/axpby_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + FLOAT_V_T vx, vy; + + if ( n < 0 ) return(0); + + if ( beta == 0.0 ) { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + memset(&y[0], 0, n * sizeof(FLOAT)); + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + size_t vl = VSETVL(n); + vy = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..3986f4e21 --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c new file mode 100644 index 000000000..5d5a8bd04 --- /dev/null +++ b/kernel/riscv64/copy_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1 && inc_y == 1) { + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c new file mode 100644 index 000000000..60dcac2f5 --- /dev/null +++ b/kernel/riscv64/dot_rvv.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + double dot = 0.0; + + if ( n <= 0 ) return(dot); + + size_t vlmax = vsetvlmax_e64m8(); + vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vle32_v_f32m4(x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vle64_v_f64m8(x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vle32_v_f32m4(y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vle64_v_f64m8(y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + + vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); +#else + vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + + vr = vfmacc_vv_f64m8(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); + dot = vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c new file mode 100644 index 000000000..34d1ea078 --- /dev/null +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#endif + +// Optimizes the implementation in ../generic/gemm_beta.c + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vx; + + if (beta == ZERO) { + + vl = VSETVL(m); + vx = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c new file mode 100644 index 000000000..5f55bc349 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_2_rvv.c @@ -0,0 +1,92 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEG2_FLOAT vsseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEG2_FLOAT vsseg2e64_v_f64m4 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; + FLOAT_V_T v1, v2; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + + for(j = (n >> 1); j > 0; j--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset, vl); + VSEV_FLOAT(b_offset, v1, vl); + + a_offset += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c new file mode 100644 index 000000000..4d4efe4c9 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_4_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG4_FLOAT vsseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG4_FLOAT vsseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..525b223c2 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + VSSEG2_FLOAT(b_offset, v1, v2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2c5230752 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c new file mode 100644 index 000000000..963e1be69 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_2_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_2.c + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT_V_T v1a, v1b, v2a, v2b; + size_t vl; + + //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + for(i = (m >> 1); i > 0; i--) { + + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); + VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); + + VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); + + a_offset1 += vl * 2; + a_offset2 += vl * 2; + b_offset1 += vl * m * 2; + } + + if (n & 1) { + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + } + + if (m & 1) { + + for(j = (n >> 1); j > 0; j -= vl) { + vl = VSETVL(j); + + VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); + + VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); + + a_offset += vl * 2; + b_offset += vl * m * 2; + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c new file mode 100644 index 000000000..ac9974b24 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv.c @@ -0,0 +1,236 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 +#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 +#endif + +// Optimizes the implementation in ../generic/gemm_tcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + for(i = (n >> 2); i > 0; i--) { + v1 = VLEV_FLOAT(a_offset1, 4); + v2 = VLEV_FLOAT(a_offset2, 4); + v3 = VLEV_FLOAT(a_offset3, 4); + v4 = VLEV_FLOAT(a_offset4, 4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + VSEV_FLOAT(b_offset1, v1, 4); + VSEV_FLOAT(b_offset2+4, v2, 4); + VSEV_FLOAT(b_offset2+8, v3, 4); + VSEV_FLOAT(b_offset2+12, v4, 4); + + b_offset1 += m * 4; + } + + if (n & 2) { + v1 = VLEV_FLOAT(a_offset1, 2); + v2 = VLEV_FLOAT(a_offset2, 2); + v3 = VLEV_FLOAT(a_offset3, 2); + v4 = VLEV_FLOAT(a_offset4, 2); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + VSEV_FLOAT(b_offset2, v1, 2); + VSEV_FLOAT(b_offset2+2, v2, 2); + VSEV_FLOAT(b_offset2+4, v3, 2); + VSEV_FLOAT(b_offset2+6, v4, 2); + + b_offset2 += 8; + } + + if (n & 1) { + v1 = VLEV_FLOAT(a_offset1, 1); + v2 = VLEV_FLOAT(a_offset2, 1); + v3 = VLEV_FLOAT(a_offset3, 1); + v4 = VLEV_FLOAT(a_offset4, 1); + + VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); + + b_offset3 += 4; + } + + } + +// TODO cleanup + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..81c1f962b --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,264 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEV_FLOAT vlse32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEV_FLOAT vlse64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..a291b70b8 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c new file mode 100644 index 000000000..ec8961ced --- /dev/null +++ b/kernel/riscv64/gemmkernel_2x2_rvv.c @@ -0,0 +1,214 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG2_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG2_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/2; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/2; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c new file mode 100644 index 000000000..aa58bcc76 --- /dev/null +++ b/kernel/riscv64/gemmkernel_4x4_rvv.c @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VLSEG4_FLOAT vlseg4e32_v_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m1 +#define VFMACCVF_FLOAT vfmacc_vf_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VLSEG4_FLOAT vlseg4e64_v_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m1 +#define VFMACCVF_FLOAT vfmacc_vf_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m1 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/gemm_kernel_2x2.c + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU + + FLOAT_V_T va0, va1, va2, va3; + FLOAT_V_T vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; + FLOAT_V_T_M1 v_z0; + + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + size_t vl; + + for (j = bn/4; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + vres8 = VFMVVF_FLOAT(0.0, vlmax); + vres9 = VFMVVF_FLOAT(0.0, vlmax); + vres10 = VFMVVF_FLOAT(0.0, vlmax); + vres11 = VFMVVF_FLOAT(0.0, vlmax); + vres12 = VFMVVF_FLOAT(0.0, vlmax); + vres13 = VFMVVF_FLOAT(0.0, vlmax); + vres14 = VFMVVF_FLOAT(0.0, vlmax); + vres15 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); + vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); + vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); + vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); + + vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); + vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl); + vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); + vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); + + ptrba += vl*4; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + C2 += 4; + C3 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); + + ptrba += vl*2; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + C1 += 2; + C2 += 2; + C3 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl*4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 1; + C1 += 1; + C2 += 1; + C3 += 1; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + if(bn & 2) { + + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); + vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); + + ptrba += vl*4; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + C1 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl*2; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 2; + C1 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl*2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 1; + C1 += 1; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + if(bn & 1) { + C0 = C; + ptrba = ba; + for (i = bm/4; i > 0; i--) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + + ptrba += vl*4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); + + C0 += 4; + } + + if(bm & 2) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + + ptrba += vl*2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); + + C0 += 2; + } + + if(bm & 1) { + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = bk; k > 0; k -= vl) { + vl = VSETVL(k); + + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); + C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); + + C0 += 1; + } + + bb += (bk<<0); + C += ldc; + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..5cd509f93 --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..9d2dee615 --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..a80af81b6 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8b33b3bcb --- /dev/null +++ b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..585b37186 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..d84ad968e --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_min; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..fb734f6f8 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 +#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFIRSTM vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 +#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..9cb332cbb --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,162 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..69771e5aa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,161 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFIRSTM vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT vid_v_u64m4_m +#define VIDV_UINT vid_v_u64m4 +#define VADDVX_MASK_UINT vadd_vx_u64m4_m +#define VADDVX_UINT vadd_vx_u64m4 +#define VMVVX_UINT vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 +#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFIRSTM vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT vid_v_u32m4_m +#define VIDV_UINT vid_v_u32m4 +#define VADDVX_MASK_UINT vadd_vx_u32m4_m +#define VADDVX_UINT vadd_vx_u32m4 +#define VMVVX_UINT vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 +#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res, v_max; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..5b1380d2b --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT(vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..bddcc0ba7 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT(vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c new file mode 100644 index 000000000..3f5d50397 --- /dev/null +++ b/kernel/riscv64/nrm2_rvv.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m8 +#define ABS fabsf +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m8 +#define ABS fabs +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if( n <= 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl * inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c new file mode 100644 index 000000000..7bf5e4270 --- /dev/null +++ b/kernel/riscv64/rot_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if(n <= 0) return(0); + + FLOAT_V_T v0, v1, vx, vy; + + if (inc_x == 0 || inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c new file mode 100644 index 000000000..d2c0378bf --- /dev/null +++ b/kernel/riscv64/scal_rvv.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( (n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T v0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + + } + + return 0; +} diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c new file mode 100644 index 000000000..1db0d09dd --- /dev/null +++ b/kernel/riscv64/sum_rvv.c @@ -0,0 +1,95 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T vx, vsum; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vsum = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT(vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c new file mode 100644 index 000000000..2cf92f6ad --- /dev/null +++ b/kernel/riscv64/swap_rvv.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG stride_x, stride_y; + FLOAT_V_T vx, vy; + + if (n <= 0) return(0); + + if (inc_x == 0 && inc_y == 0) { + if (n & 1) { + FLOAT temp = x[0]; + x[0] = y[0]; + y[0] = temp; + } + else { + return 0; + } + } + else if(inc_x == 0) { + FLOAT temp = x[0]; + x[0] = y[(n - 1) * inc_y]; + FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one + stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c new file mode 100644 index 000000000..f0def9617 --- /dev/null +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -0,0 +1,101 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_lcopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..958506df3 --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..737abaae3 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 000000000..cb923be5d --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 +#define VFMACCVV_FLOAT vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT vfmul_vf_f32m8 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 +#define VFMACCVV_FLOAT vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT vfmul_vf_f64m8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_res = VFMVVF_FLOAT_M1(0, vl_max); + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vr = VFMVVF_FLOAT(0, vl); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + vr = VFMVVF_FLOAT(0, vl); + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vr = VFMVVF_FLOAT(0, vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..73a8233f8 --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..2fe8cf79e --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..b64cd840d --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..b96daae5b --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c new file mode 100644 index 000000000..127e76970 --- /dev/null +++ b/kernel/riscv64/trmmkernel_2x2_rvv.c @@ -0,0 +1,342 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_2x2.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, vb0, vb1; + FLOAT_V_T vres0, vres1, vres2, vres3; + FLOAT_V_T_M1 v_res, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); + C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+2; + C1 = C1+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off*2; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + C1 = C1+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off; +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || \ + (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; +#else + temp = off+1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + vb0 = VLEV_FLOAT(ptrbb, vl); + VLSEG_FLOAT(&va0, &va1, ptrba, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); + + ptrba += vl * 2; + ptrbb += vl; + + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); + C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp; +#endif +#ifdef LEFT + off += 2; +#endif + + C0 = C0+2; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off; + ptrbb = bb+off; +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); + ptrba += vl; + ptrbb += vl; + } + v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp; + ptrbb += temp; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+1; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c new file mode 100644 index 000000000..3e46c6348 --- /dev/null +++ b/kernel/riscv64/trmmkernel_4x4_rvv.c @@ -0,0 +1,881 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMUL_FLOAT vfmul_vv_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMUL_FLOAT vfmul_vv_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_4x4.c + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; + v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vl; + size_t vlmax = VSETVL_MAX; + + FLOAT_V_T vres0_0; + FLOAT_V_T vres0_1; + FLOAT_V_T vres0_2; + FLOAT_V_T vres0_3; + + FLOAT_V_T vres1_0; + FLOAT_V_T vres1_1; + FLOAT_V_T vres1_2; + FLOAT_V_T vres1_3; + + FLOAT_V_T vres2_0; + FLOAT_V_T vres2_1; + FLOAT_V_T vres2_2; + FLOAT_V_T vres2_3; + + FLOAT_V_T vres3_0; + FLOAT_V_T vres3_1; + FLOAT_V_T vres3_2; + FLOAT_V_T vres3_3; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + + for (j=0; j 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); + vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); + vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); + + ptrba += vl * 4; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + if (!backwards) { + temp = bk-off; + temp = left ? temp - 4 : // number of values in A + temp - 4; // number of values in B + + ptrba += temp*4; // number of values in A + ptrbb += temp*4; // number of values in B + } +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + C2 = C2+4; + C3 = C3+4; + + } + + if ( bm & 2 ) // do any 2x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres2_1 = VFMVVF_FLOAT(0, vlmax); + + vres3_0 = VFMVVF_FLOAT(0, vlmax); + vres3_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+4; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); + vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); + + ptrba += vl * 2; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); + + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + C2 = C2+2; + C3 = C3+2; + + } + + if ( bm & 1 ) // do any 1x4 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*4; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres2_0 = VFMVVF_FLOAT(0, vlmax); + vres3_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); + vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); + + ptrba += vl; + ptrbb += vl * 4; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + C2 = C2+1; + C3 = C3+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + for (j=0; j<(bn&2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); + + ptrba += vl * 4; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + C1 = C1+4; + + } + + if ( bm & 2 ) // do any 2x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + + vres1_0 = VFMVVF_FLOAT(0, vlmax); + vres1_1 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); + + ptrba += vl * 2; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); + + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); + C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + C1 = C1+2; + + } + + if ( bm & 1 ) // do any 1x2 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*2; +#endif + + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres1_0 = VFMVVF_FLOAT(0, vlmax); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); + + ptrba += vl; + ptrbb += vl * 2; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + C1 = C1+1; + + } + + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + + for (j=0; j<(bn&1); j+=1) // do the Mx1 loops + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i=0; i 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); + + vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); + + ptrba += vl * 4; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); + vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); + C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 4; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*4; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 4; // number of values in A +#endif + + C0 = C0+4; + + } + + if ( bm & 2 ) // do any 2x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + vres0_1 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+2; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); + + ptrba += vl * 2; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*2; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 2; // number of values in A +#endif + + C0 = C0+2; + + } + + if ( bm & 1 ) // do any 1x1 loop + { + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*1; + ptrbb = bb + off*1; +#endif + + vres0_0 = VFMVVF_FLOAT(0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+1; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + va0 = VLEV_FLOAT(ptrba, vl); + vb0 = VLEV_FLOAT(ptrbb, vl); + + vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); + + ptrba += vl; + ptrbb += vl; + } + + vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); + C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*1; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += 1; // number of values in A +#endif + + C0 = C0+1; + + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + k = (bk<<0); + bb = bb+k; + C = C+ldc; + } + return 0; +} diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..97b14650c --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..11a0398ca --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,847 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + pa = a; + pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + + size_t vl; + FLOAT_V_T va, vc0, vc1; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + //bb = *(c + i + j * ldc); + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + //bb *= aa; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + //*b = bb; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + + //*(c + i + j * ldc) = bb; + //b ++; + + b += 2; + //pa = a + i + 1; + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + pa = a; + //pc = c + j * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + + size_t vl; + FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + + b += 8; + + pc0 = c + j * ldc * 8; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc + j * ldc * 4; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + + b += 4; + + pc0 = c + j * ldc * 4; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + + *b = bb0; + *(b+1) = bb1; + + *pb0 = bb0; + *pb1 = bb1; + + b += 2; + + pc0 = c + j * ldc * 2; + pc1 = pc0 + ldc; + + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = (*pb0) * aa; + *b = bb0; + *pb0 = bb0; + b += 1; + + pc0 = pc - i; + pa = a; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + + pa += vl; + pc0 += vl; + } + } + + a -= m; + b -= 2 * n; + } + +} +#else +static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..0380bd1bb --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + FLOAT aa, bb; + FLOAT *pa, *pc; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc; + for (i = 0; i < m; i++) + { + aa = *(a + i); + for (j = 0; j < n; j ++) + { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b++; + pa = a + i + 1; + pc = c + j * ldc + i + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLEV_FLOAT(pc, vl); + va = VLEV_FLOAT(pa, vl); + vc = VFNMSACVF_FLOAT(vc, bb, va, vl); + VSEV_FLOAT(pc, vc, vl); + pa += vl; + pc += vl; + } + } + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1; + FLOAT *pa, *pc, *pc0, *pc1; + FLOAT *pb0, *pb1; + + int i, j, k; + size_t vl; + FLOAT_V_T va, vc0, vc1; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/2; j ++) + { + pb0 = pc + j * ldc * 2; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + } + pc += ldc * (n/2) * 2; + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pb0, *pb1, *pb2, *pb3; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/4; j ++) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + } + pc += ldc * (n/4) * 4; + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + + FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; + FLOAT *pa, *pc; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + + int i, j, k; + size_t vl; + FLOAT_V_T va; + FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + for (i = 0; i < m; i++) + { + aa = *(a + i); + pc = c + i; + for (j = 0; j < n/8; j ++) + { + pb0 = pc + j * ldc * 8; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + pb4 = pb3 + ldc; + pb5 = pb4 + ldc; + pb6 = pb5 + ldc; + pb7 = pb6 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + bb4 = (*pb4) * aa; + bb5 = (*pb5) * aa; + bb6 = (*pb6) * aa; + bb7 = (*pb7) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + *(b+4) = bb4; + *(b+5) = bb5; + *(b+6) = bb6; + *(b+7) = bb7; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + *pb4 = bb4; + *pb5 = bb5; + *pb6 = bb6; + *pb7 = bb7; + b += 8; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + pc4 = pc3 + ldc; + pc5 = pc4 + ldc; + pc6 = pc5 + ldc; + pc7 = pc6 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + vc4 = VLEV_FLOAT(pc4, vl); + vc5 = VLEV_FLOAT(pc5, vl); + vc6 = VLEV_FLOAT(pc6, vl); + vc7 = VLEV_FLOAT(pc7, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); + vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); + vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); + vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + VSEV_FLOAT(pc4, vc4, vl); + VSEV_FLOAT(pc5, vc5, vl); + VSEV_FLOAT(pc6, vc6, vl); + VSEV_FLOAT(pc7, vc7, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + pc4 += vl; + pc5 += vl; + pc6 += vl; + pc7 += vl; + } + } + pc += ldc * (n/8) * 8; + + if (n & 4) + { + pb0 = pc; + pb1 = pb0 + ldc; + pb2 = pb1 + ldc; + pb3 = pb2 + ldc; + + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + bb2 = (*pb2) * aa; + bb3 = (*pb3) * aa; + + *b = bb0; + *(b+1) = bb1; + *(b+2) = bb2; + *(b+3) = bb3; + + *pb0 = bb0; + *pb1 = bb1; + *pb2 = bb2; + *pb3 = bb3; + b += 4; + + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + vc2 = VLEV_FLOAT(pc2, vl); + vc3 = VLEV_FLOAT(pc3, vl); + + va = VLEV_FLOAT(pa, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); + vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); + + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + VSEV_FLOAT(pc2, vc2, vl); + VSEV_FLOAT(pc3, vc3, vl); + + pa += vl; + pc0 += vl; + pc1 += vl; + pc2 += vl; + pc3 += vl; + } + pc += ldc * 4; + } + + if (n & 2) + { + pb0 = pc; + pb1 = pb0 + ldc; + bb0 = (*pb0) * aa; + bb1 = (*pb1) * aa; + *b = bb0; + *(b+1) = bb1; + *pb0 = bb0; + *pb1 = bb1; + b += 2; + pa = a + i + 1; + pc0 = pb0 + 1; + pc1 = pc0 + ldc; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + vc1 = VLEV_FLOAT(pc1, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + VSEV_FLOAT(pc1, vc1, vl); + pa += vl; + pc0 += vl; + pc1 += vl; + } + pc += ldc * 2; + } + + if (n & 1) + { + pb0 = pc; + bb0 = *(pb0); + bb0 *= aa; + *b = bb0; + *(c + i) = bb0; + b++; + pa = a + i + 1; + pc0 = pb0 + 1; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLEV_FLOAT(pc0, vl); + va = VLEV_FLOAT(pa, vl); + vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); + VSEV_FLOAT(pc0, vc0, vl); + pa += vl; + pc0 += vl; + } + } + + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + + +static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + FLOAT *pa, *pc; + + int i, j, k; + + size_t vl; + FLOAT_V_T va0, va1, vc0, vc1; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + pa = a + (i + 1) * 2; + pc = c + j * ldc + (i + 1) * 2; + for (k = (m - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, pa, vl); + VLSEG2_FLOAT(&vc0, &vc1, pc, vl); +#ifndef CONJ + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#else + vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); + vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); + vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); + vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); +#endif + VSSEG2_FLOAT(pc, vc0, vc1, vl); + pa += vl * 2; + pc += vl * 2; + } + } + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..41368be60 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,792 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + + for (j = 0; j < m; j ++) + { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b + i + 1; + pc = c + j + (i + 1) *ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc ++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + for (i = 0; i < n; i++) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + + a += 8; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + pc4 = pa4 + ldc; + pc5 = pa5 + ldc; + pc6 = pa6 + ldc; + pc7 = pa7 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + + a += 4; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + pc2 = pa2 + ldc; + pc3 = pa3 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + + a += 2; + + pb = b + i + 1; + pc0 = pa0 + ldc; + pc1 = pa1 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b + i + 1; + pc0 = pa0 + ldc; + for (k = (n - i - 1); k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b += n; + } +} +#else +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..459c1663a --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,828 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSSEV_FLOAT vsse32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSSEV_FLOAT vsse64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +#if GEMM_DEFAULT_UNROLL_N == 1 +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *pb, *pc; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + pb = b; + pc = c + j; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc = VLSEV_FLOAT(pc, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); + VSSEV_FLOAT(pc, stride_ldc, vc, vl); + pb += vl; + pc++; + } + } + b -= n; + a -= 2 * m; + } + +} +#elif GEMM_DEFAULT_UNROLL_N == 2 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pc0, *pc1; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/2; j ++) + { + pa0 = pc + j * 2; + pa1 = pc + j * 2 + 1; + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 2; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + pb += vl; + pc0++; + pc1++; + } + } + pc += (m/2)*2; + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#elif GEMM_DEFAULT_UNROLL_N == 4 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3; + FLOAT *pc0, *pc1, *pc2, *pc3; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/4; j ++) + { + pa0 = pc + j * 4; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + } + pc += (m/4)*4; + + if (m & 2) + { + pa0 = pc + j * 2; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = c + j * 4; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} +#elif GEMM_DEFAULT_UNROLL_N == 8 + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; + FLOAT bb; + FLOAT *pb, *pc; + FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; + FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + int i, j, k; + size_t vl; + FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) + { + bb = *(b + i); + pc = c + i * ldc; + for (j = 0; j < m/8; j ++) + { + pa0 = pc + j * 8; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + pa4 = pa3 + 1; + pa5 = pa4 + 1; + pa6 = pa5 + 1; + pa7 = pa6 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + aa4 = *pa4 * bb; + aa5 = *pa5 * bb; + aa6 = *pa6 * bb; + aa7 = *pa7 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + *pa4 = aa4; + *pa5 = aa5; + *pa6 = aa6; + *pa7 = aa7; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + *(a + 4)= aa4; + *(a + 5)= aa5; + *(a + 6)= aa6; + *(a + 7)= aa7; + a += 8; + + pb = b; + pc0 = c + j * 8; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + pc4 = pc3 + 1; + pc5 = pc4 + 1; + pc6 = pc5 + 1; + pc7 = pc6 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); + vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); + vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); + vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); + vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); + vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); + vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); + VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); + VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); + VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + pc4++; + pc5++; + pc6++; + pc7++; + } + } + pc += (m/8)*8; + + if (m & 4) + { + pa0 = pc; + pa1 = pa0 + 1; + pa2 = pa1 + 1; + pa3 = pa2 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + aa2 = *pa2 * bb; + aa3 = *pa3 * bb; + + *pa0 = aa0; + *pa1 = aa1; + *pa2 = aa2; + *pa3 = aa3; + + *a = aa0; + *(a + 1)= aa1; + *(a + 2)= aa2; + *(a + 3)= aa3; + a += 4; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + pc2 = pc1 + 1; + pc3 = pc2 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); + vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); + vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); + VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); + + pb += vl; + pc0++; + pc1++; + pc2++; + pc3++; + } + pc += 4; + } + + if (m & 2) + { + pa0 = pc; + pa1 = pa0 + 1; + + aa0 = *pa0 * bb; + aa1 = *pa1 * bb; + + *pa0 = aa0; + *pa1 = aa1; + + *a = aa0; + *(a + 1)= aa1; + a += 2; + + pb = b; + pc0 = pc - i * ldc; + pc1 = pc0 + 1; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); + + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); + + pb += vl; + pc0++; + pc1++; + } + pc += 2; + } + + if (m & 1) + { + pa0 = pc; + aa0 = *pa0 * bb; + + *pa0 = aa0; + *a = aa0; + a += 1; + + pb = b; + pc0 = pc - i * ldc; + for (k = i; k > 0; k -= vl) + { + vl = VSETVL(k); + vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); + vb = VLEV_FLOAT(pb, vl); + vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); + VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); + pb += vl; + pc0++; + } + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#endif + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..bacfb2b08 --- /dev/null +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..0fc7c9f24 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..ee869a795 --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a324b0fa6 --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VSEV_FLOAT_M vse32_v_f32m2_m +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VSEV_FLOAT_M vse64_v_f64m2_m +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..1917042be --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT(vmax, v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..3f027383a --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFABSV_FLOAT vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFABSV_FLOAT vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT(vmin, v0, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c new file mode 100644 index 000000000..7876646b3 --- /dev/null +++ b/kernel/riscv64/zasum_rvv.c @@ -0,0 +1,108 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VFABSV_FLOAT vfabs_v_f32m8 +#else +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VFABSV_FLOAT vfabs_v_f64m8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asumf = 0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c new file mode 100644 index 000000000..66f52d9d0 --- /dev/null +++ b/kernel/riscv64/zaxpby_rvv.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/*************************************************************************** +* 2014/06/07 Saar +* +***************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + BLASLONG stride_x = inc_x2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y2 * sizeof(FLOAT); + FLOAT_V_T vx0, vx1, vy0, vy1; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + size_t vl = VSETVL(n); + FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); + for ( ; n > 0; n -= vl, y += vl*stride_y) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..777bcb728 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..5d8322bbb --- /dev/null +++ b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 vle32_v_f32m8 +#define VSEV_FLOAT_M8 vse32_v_f32m8 + +#define VSETVL_M4(n) vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#else +#define VSETVL_M8(n) vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 vle64_v_f64m8 +#define VSEV_FLOAT_M8 vse64_v_f64m8 + +#define VSETVL_M4(n) vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n < 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSEG_FLOAT_M4(&vr, &vi, x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } else if (1 == inc_y) { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSEG_FLOAT_M4(y, vr, vi, vl); + } + } else { + + FLOAT_V_T_M4 vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..7eae6f608 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMSACVV_FLOAT vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMSACVV_FLOAT vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..a89752d18 --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VLSEG_FLOAT(&vr, &vi, c_offset, vl); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + VSSEG_FLOAT(c_offset, vr, vi, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..2eeb61b45 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + VLSEG_FLOAT(&va0, &va1, a_ptr, vl); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..b682d5cd8 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); + VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..921ddb8cb --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT vfabs_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT vfabs_v_f64m4 +#endif + +// TODO: Should single precision use the widening MAC, or perhaps all should be double? + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + + if ( n <= 0 ) return(0.0); + + FLOAT_V_T vr, v0, v1; + FLOAT_V_T_M1 v_max, v_res; + FLOAT scale = 0.0, ssq = 0.0; + + size_t vlmax = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_max = VFMVVF_FLOAT_M1(0, vlmax); + + vr = VFMVVF_FLOAT(0, vlmax); + + if (inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + + v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + } + + } + + v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + + ssq = VFMVFS_FLOAT_M1(v_res); + scale = VFMVFS_FLOAT_M1(v_max); + ssq = ssq / (scale*scale); + + return(scale * sqrt(ssq)); +} diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c new file mode 100644 index 000000000..68066a00b --- /dev/null +++ b/kernel/riscv64/zrot_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + + if (inc_x == 0 && inc_y == 0) { + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSEG_FLOAT(x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSEG_FLOAT(y, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); + VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..079c36a2d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMULVF_FLOAT vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMULVF_FLOAT vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + + if(da_r == 0.0 && da_i == 0.0) { + + vr = VFMVVF_FLOAT(0.0, vlmax); + vi = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VSSEG_FLOAT(x, vr, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + } + + } else if(da_r == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vi, -da_i, vl); + vi = VFMULVF_FLOAT(vr, da_i, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + + } else if(da_i == 0.0) { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vr = VFMULVF_FLOAT(vr, da_r, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + + VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + } + + } else { + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vr, &vi, x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSEG_FLOAT(x, vt, vi, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + } + } + } + + return(0); +} diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..3928fbe27 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,97 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT vfadd_vv_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT vfadd_vv_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&v0, &v1, x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + + v_sum = VFADDVV_FLOAT(v_sum, v0, vl); + v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c new file mode 100644 index 000000000..86f9103d3 --- /dev/null +++ b/kernel/riscv64/zswap_rvv.c @@ -0,0 +1,156 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#else +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if (n <= 0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + + if (inc_x == 0 && inc_y == 0) { + if (n & 1) { + FLOAT temp[2]; + temp[0] = x[0]; + temp[1] = x[1]; + x[0] = y[0]; + x[1] = y[1]; + y[0] = temp[0]; + y[1] = temp[1]; + } + else { + return 0; + } + } + else if(inc_x == 0) { + FLOAT temp[2]; + temp[0] = x[0]; + temp[1] = x[1]; + x[0] = y[(n - 1) * inc_y * 2]; + x[0] = y[(n - 1) * inc_y * 2 + 1]; + FLOAT* ptr = y + (n - 1) * inc_y * 2; // start from the last one + BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) { + vl = VSETVL(m); + VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + } + y[0] = temp[0]; + y[1] = temp[1]; + } + else if(inc_y == 0) { + FLOAT temp[2]; + temp[0] = y[0]; + temp[1] = y[1]; + y[0] = x[(n - 1) * inc_x * 2]; + y[0] = x[(n - 1) * inc_x * 2 + 1]; + FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one + BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { + vl = VSETVL(m); + VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + } + x[0] = temp[0]; + x[1] = temp[1]; + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSEG_FLOAT(&vx0, &vx1, x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSEG_FLOAT(x, vy0, vy1, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSEG_FLOAT(&vy0, &vy1, y, vl); + + VSSEG_FLOAT(y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); + VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..3486a4648 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define VSETVL_MAX_M1 vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEG4_FLOAT vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define VSETVL_MAX_M1 vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEG4_FLOAT vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/param.h b/param.h index 514b13a3a..62b675d6c 100644 --- a/param.h +++ b/param.h @@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If VLMAX size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 18:22:21 -0800 Subject: [PATCH 004/718] Remove redundant files --- Makefile.install | 5 - kernel/riscv64/KERNEL.x280 | 36 +- kernel/riscv64/gemm_ncopy_2_rvv.c | 92 --- kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ---- kernel/riscv64/gemm_tcopy_2_rvv.c | 108 ---- kernel/riscv64/gemm_tcopy_4_rvv.c | 236 -------- kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ------- kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ---------------- kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ----------- kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ---------------------------- 10 files changed, 2 insertions(+), 2543 deletions(-) delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c diff --git a/Makefile.install b/Makefile.install index f1adaa271..168d08f72 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin -OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -39,7 +38,6 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" - @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -204,8 +202,5 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! -#Generating release tar - @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz - @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 2eb60f2b4..4d64354fb 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c # SGEMM_UNROLL_N set in params.h -ifeq ($(SGEMM_UNROLL_N), 2) -SGEMMKERNEL = gemmkernel_2x2_rvv.c -SGEMMONCOPY = gemm_ncopy_2_rvv.c -SGEMMOTCOPY = gemm_tcopy_2_rvv.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMKERNEL = gemmkernel_4x4_rvv.c -SGEMMONCOPY = gemm_ncopy_4_rvv.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 8) +ifeq ($(SGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX SGEMMKERNEL = gemmkernel_rvv_v1x8.c SGEMMINCOPY = gemm_ncopy_rvv_v1.c @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif # SGEMM_UNROLL_N set in params.h -ifeq ($(DGEMM_UNROLL_N), 2) -DGEMMKERNEL = gemmkernel_2x2_rvv.c -DGEMMONCOPY = gemm_ncopy_2_rvv.c -DGEMMOTCOPY = gemm_tcopy_2_rvv.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMKERNEL = gemmkernel_4x4_rvv.c -DGEMMONCOPY = gemm_ncopy_4_rvv.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 8) +ifeq ($(DGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX DGEMMKERNEL = gemmkernel_rvv_v1x8.c DGEMMINCOPY = gemm_ncopy_rvv_v1.c diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c deleted file mode 100644 index 5f55bc349..000000000 --- a/kernel/riscv64/gemm_ncopy_2_rvv.c +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEG2_FLOAT vsseg2e32_v_f32m4 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEG2_FLOAT vsseg2e64_v_f64m4 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - FLOAT_V_T v1, v2; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - - for(j = (n >> 1); j > 0; j--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset, vl); - VSEV_FLOAT(b_offset, v1, vl); - - a_offset += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c deleted file mode 100644 index 4d4efe4c9..000000000 --- a/kernel/riscv64/gemm_ncopy_4_rvv.c +++ /dev/null @@ -1,123 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG4_FLOAT vsseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG4_FLOAT vsseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset; - - FLOAT_V_T v1, v2, v3, v4; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - for(j = (n >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - v3 = VLEV_FLOAT(a_offset3, vl); - v4 = VLEV_FLOAT(a_offset4, vl); - - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); - - a_offset1 += vl; - a_offset2 += vl; - a_offset3 += vl; - a_offset4 += vl; - b_offset += vl*4; - } - } - - if (n & 2) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - a_offset1 = a_offset; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - - VSEV_FLOAT(b_offset, v1, vl); - - a_offset1 += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c deleted file mode 100644 index 963e1be69..000000000 --- a/kernel/riscv64/gemm_tcopy_2_rvv.c +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset, *b_offset1, *b_offset2; - FLOAT_V_T v1a, v1b, v2a, v2b; - size_t vl; - - //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - b_offset2 = b + m * (n & ~1); - - for(i = (m >> 1); i > 0; i--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 4; - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); - VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); - - VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); - - a_offset1 += vl * 2; - a_offset2 += vl * 2; - b_offset1 += vl * m * 2; - } - - if (n & 1) { - *(b_offset2 + 0) = *(a_offset1 + 0); - *(b_offset2 + 1) = *(a_offset2 + 0); - b_offset2 += 2; - } - } - - if (m & 1) { - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); - - VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); - - a_offset += vl * 2; - b_offset += vl * m * 2; - } - - if (n & 1){ - *(b_offset2 + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c deleted file mode 100644 index ac9974b24..000000000 --- a/kernel/riscv64/gemm_tcopy_4_rvv.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - - //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - b_offset2 = b + m * (n & ~3); - b_offset3 = b + m * (n & ~1); - - for(j = (m >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - b_offset1 = b_offset; - b_offset += 16; - - for(i = (n >> 2); i > 0; i--) { - v1 = VLEV_FLOAT(a_offset1, 4); - v2 = VLEV_FLOAT(a_offset2, 4); - v3 = VLEV_FLOAT(a_offset3, 4); - v4 = VLEV_FLOAT(a_offset4, 4); - - a_offset1 += 4; - a_offset2 += 4; - a_offset3 += 4; - a_offset4 += 4; - - VSEV_FLOAT(b_offset1, v1, 4); - VSEV_FLOAT(b_offset2+4, v2, 4); - VSEV_FLOAT(b_offset2+8, v3, 4); - VSEV_FLOAT(b_offset2+12, v4, 4); - - b_offset1 += m * 4; - } - - if (n & 2) { - v1 = VLEV_FLOAT(a_offset1, 2); - v2 = VLEV_FLOAT(a_offset2, 2); - v3 = VLEV_FLOAT(a_offset3, 2); - v4 = VLEV_FLOAT(a_offset4, 2); - - a_offset1 += 2; - a_offset2 += 2; - a_offset3 += 2; - a_offset4 += 2; - - VSEV_FLOAT(b_offset2, v1, 2); - VSEV_FLOAT(b_offset2+2, v2, 2); - VSEV_FLOAT(b_offset2+4, v3, 2); - VSEV_FLOAT(b_offset2+6, v4, 2); - - b_offset2 += 8; - } - - if (n & 1) { - v1 = VLEV_FLOAT(a_offset1, 1); - v2 = VLEV_FLOAT(a_offset2, 1); - v3 = VLEV_FLOAT(a_offset3, 1); - v4 = VLEV_FLOAT(a_offset4, 1); - - VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); - - b_offset3 += 4; - } - - } - -// TODO cleanup - - if (m & 2){ - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 8; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - ctemp5 = *(a_offset2 + 0); - ctemp6 = *(a_offset2 + 1); - ctemp7 = *(a_offset2 + 2); - ctemp8 = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - *(b_offset1 + 4) = ctemp5; - *(b_offset1 + 5) = ctemp6; - *(b_offset1 + 6) = ctemp7; - *(b_offset1 + 7) = ctemp8; - - b_offset1 += m * 4; - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - - ctemp3 = *(a_offset2 + 0); - ctemp4 = *(a_offset2 + 1); - - a_offset1 += 2; - a_offset2 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - *(b_offset2 + 2) = ctemp3; - *(b_offset2 + 3) = ctemp4; - - b_offset2 += 4; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset2 + 0); - - *(b_offset3 + 0) = ctemp1; - *(b_offset3 + 1) = ctemp2; - b_offset3 += 2; - } - } - - if (m & 1){ - a_offset1 = a_offset; - b_offset1 = b_offset; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - a_offset1 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - b_offset1 += 4 * m; - - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - a_offset1 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - *(b_offset3 + 0) = ctemp1; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c deleted file mode 100644 index ec8961ced..000000000 --- a/kernel/riscv64/gemmkernel_2x2_rvv.c +++ /dev/null @@ -1,214 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG2_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG2_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/2; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c deleted file mode 100644 index aa58bcc76..000000000 --- a/kernel/riscv64/gemmkernel_4x4_rvv.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VLSEG4_FLOAT vlseg4e32_v_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m1 -#define VFMACCVF_FLOAT vfmacc_vf_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m1(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VLSEG4_FLOAT vlseg4e64_v_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m1 -#define VFMACCVF_FLOAT vfmacc_vf_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU - - FLOAT_V_T va0, va1, va2, va3; - FLOAT_V_T vb0, vb1, vb2, vb3; - FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; - FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/4; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - C2 = C1 + ldc; - C3 = C2 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - vres8 = VFMVVF_FLOAT(0.0, vlmax); - vres9 = VFMVVF_FLOAT(0.0, vlmax); - vres10 = VFMVVF_FLOAT(0.0, vlmax); - vres11 = VFMVVF_FLOAT(0.0, vlmax); - vres12 = VFMVVF_FLOAT(0.0, vlmax); - vres13 = VFMVVF_FLOAT(0.0, vlmax); - vres14 = VFMVVF_FLOAT(0.0, vlmax); - vres15 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); - vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); - vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); - vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); - - vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); - vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl); - vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); - vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); - - ptrba += vl*4; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - C2 += 4; - C3 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - ptrba += vl*2; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - C1 += 2; - C2 += 2; - C3 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); - vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 1; - C1 += 1; - C2 += 1; - C3 += 1; - } - - bb += (bk<<2); - C += (ldc<<2); - } - - if(bn & 2) { - - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); - vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); - vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); - - ptrba += vl*4; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - ptrba += vl*4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c deleted file mode 100644 index 127e76970..000000000 --- a/kernel/riscv64/trmmkernel_2x2_rvv.c +++ /dev/null @@ -1,342 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_2x2.c - - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - BLASLONG off, temp; - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 v_res, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - -#if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#else - off = 0; -#endif - - for (j = bn/2; j > 0; j--) - { - C0 = C; - C1 = C0+ldc; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); - C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 2; -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 2; -#endif - C0 = C0+2; - C1 = C1+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 2; -#endif - ptrba += temp; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - C1 = C1+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - if (bn & 1) - { - C0 = C; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off; -#endif - - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - vb0 = VLEV_FLOAT(ptrbb, vl); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); - - ptrba += vl * 2; - ptrbb += vl; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 1; -#endif - ptrba += temp*2; - ptrbb += temp; -#endif -#ifdef LEFT - off += 2; -#endif - - C0 = C0+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off + 1; -#else - temp = off + 1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - ptrba += vl; - ptrbb += vl; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 1; -#endif - ptrba += temp; - ptrbb += temp; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} - diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c deleted file mode 100644 index 3e46c6348..000000000 --- a/kernel/riscv64/trmmkernel_4x4_rvv.c +++ /dev/null @@ -1,881 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMUL_FLOAT vfmul_vv_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMUL_FLOAT vfmul_vv_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_4x4.c - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - - FLOAT_V_T vres0_0; - FLOAT_V_T vres0_1; - FLOAT_V_T vres0_2; - FLOAT_V_T vres0_3; - - FLOAT_V_T vres1_0; - FLOAT_V_T vres1_1; - FLOAT_V_T vres1_2; - FLOAT_V_T vres1_3; - - FLOAT_V_T vres2_0; - FLOAT_V_T vres2_1; - FLOAT_V_T vres2_2; - FLOAT_V_T vres2_3; - - FLOAT_V_T vres3_0; - FLOAT_V_T vres3_1; - FLOAT_V_T vres3_2; - FLOAT_V_T vres3_3; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - - for (j=0; j 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); - vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); - vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); - - ptrba += vl * 4; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - if (!backwards) { - temp = bk-off; - temp = left ? temp - 4 : // number of values in A - temp - 4; // number of values in B - - ptrba += temp*4; // number of values in A - ptrbb += temp*4; // number of values in B - } -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+4; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - ptrba += vl * 2; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); - - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres3_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; -#endif - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - - ptrba += vl * 4; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - - ptrba += vl * 4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - ptrba += vl * 2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} From 9702d57b11351a5360a2f0326c69c3f550c784d2 Mon Sep 17 00:00:00 2001 From: HellerZheng Date: Wed, 16 Nov 2022 11:11:04 +0800 Subject: [PATCH 005/718] Update Makefile.install --- Makefile.install | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 168d08f72..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -202,5 +202,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - - From 3918d8504e7720d94221025ae6078a2459ccb104 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 21 Nov 2022 19:06:07 -0800 Subject: [PATCH 006/718] nrm2 simple optimization --- kernel/riscv64/nrm2_rvv.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 3f5d50397..979c31648 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -39,9 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVV_FLOAT vfmacc_vv_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m8 #define ABS fabsf #else #define VSETVL(n) vsetvl_e64m8(n) @@ -54,9 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMACCVV_FLOAT vfmacc_vv_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m8 #define ABS fabs #endif @@ -68,12 +64,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; + FLOAT_V_T_M1 v_res; + FLOAT ssq = 0.0; size_t vlmax = VSETVL_MAX; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); vr = VFMVVF_FLOAT(0, vlmax); @@ -83,9 +78,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLEV_FLOAT(x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } @@ -98,20 +90,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); } - } v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - return(scale * sqrt(ssq)); + return sqrt(ssq); } From 387e8970cd8ce581a6c7bc48418860966140f621 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 28 Nov 2022 21:42:29 -0800 Subject: [PATCH 007/718] Fix merge problem; Update compiling COMMON_OPT per review comments. --- Makefile.prebuild | 2 +- Makefile.riscv64 | 6 +++--- common_riscv64.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index e6a8eab59..c4f4a2602 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -56,7 +56,7 @@ TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif ifeq ($(TARGET), x280) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280 +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_GENERIC) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d6eaf552d..d091984a6 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,10 +3,10 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static -endif \ No newline at end of file +endif diff --git a/common_riscv64.h b/common_riscv64.h index 221a79901..2092bd5ab 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define SEEK_ADDRESS #if defined(C910V) -#include +#include #endif #if defined(x280) From c19dff0a31c58163dc386b0c4270e75f576b97be Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 25 Jan 2023 19:33:32 +0800 Subject: [PATCH 008/718] Fix T-Head RVV intrinsic API changes. --- kernel/riscv64/amax_vector.c | 8 ++++---- kernel/riscv64/amin_vector.c | 8 ++++---- kernel/riscv64/asum_vector.c | 8 ++++---- kernel/riscv64/axpby_vector.c | 16 ++++++++-------- kernel/riscv64/axpy_vector.c | 16 ++++++++-------- kernel/riscv64/copy_vector.c | 16 ++++++++-------- kernel/riscv64/dot_vector.c | 8 ++++---- kernel/riscv64/gemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/gemv_t_vector.c | 8 ++++---- kernel/riscv64/iamax_vector.c | 8 ++++---- kernel/riscv64/iamin_vector.c | 8 ++++---- kernel/riscv64/imax_vector.c | 8 ++++---- kernel/riscv64/imin_vector.c | 8 ++++---- kernel/riscv64/izamax_vector.c | 4 ++-- kernel/riscv64/izamin_vector.c | 4 ++-- kernel/riscv64/max_vector.c | 8 ++++---- kernel/riscv64/min_vector.c | 8 ++++---- kernel/riscv64/nrm2_vector.c | 8 ++++---- kernel/riscv64/rot_vector.c | 16 ++++++++-------- kernel/riscv64/scal_vector.c | 16 ++++++++-------- kernel/riscv64/swap_vector.c | 16 ++++++++-------- kernel/riscv64/symv_L_vector.c | 16 ++++++++-------- kernel/riscv64/symv_U_vector.c | 16 ++++++++-------- kernel/riscv64/zamax_vector.c | 4 ++-- kernel/riscv64/zamin_vector.c | 4 ++-- kernel/riscv64/zasum_vector.c | 8 ++++---- kernel/riscv64/zaxpby_vector.c | 8 ++++---- kernel/riscv64/zaxpy_vector.c | 8 ++++---- kernel/riscv64/zcopy_vector.c | 8 ++++---- kernel/riscv64/zdot_vector.c | 8 ++++---- kernel/riscv64/zgemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/zgemv_t_vector.c | 4 ++-- kernel/riscv64/zhemv_LM_vector.c | 8 ++++---- kernel/riscv64/zhemv_UV_vector.c | 8 ++++---- kernel/riscv64/znrm2_vector.c | 8 ++++---- kernel/riscv64/zrot_vector.c | 16 ++++++++-------- kernel/riscv64/zscal_vector.c | 8 ++++---- kernel/riscv64/zswap_vector.c | 16 ++++++++-------- 38 files changed, 190 insertions(+), 190 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55..1b7799340 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9..f9b7defae 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a82275153..fc73362bc 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec2..676dfd474 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814..6f921f2d6 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6c..fee5e195d 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40..f47e0c0b5 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b..bb9ab8e5a 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa..7d0b70cbb 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9..9fea522f7 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cc..4e81e7848 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8..ca48a3c48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a3310..2a677098d 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8..66a101566 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c720310..818193a9e 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c9..7f31e9a53 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df6..14b7e01ed 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 2a83e2a52..cf6fdb741 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c69..43a65e552 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFMSACVF_FLOAT vfmsac_vf_f32m4 @@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFMSACVF_FLOAT vfmsac_vf_f64m4 diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7c..8b9ef5a3e 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #define VFMULVF_FLOAT vfmul_vf_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #define VFMULVF_FLOAT vfmul_vf_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #endif diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f1..b16592808 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda..58ec17b03 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6..34ff0e30a 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f..bfb282ae0 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac..d9eca7f10 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e..0d1cc42f1 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 3eca20415..5e6034ac5 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 303d3541e..4ccfe4a81 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 600f02bba..55a480a35 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index ec38ed9d2..0900206b3 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index b5ee1f054..3095c28f9 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index e930dc2a2..a7a8a5279 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 275ee9131..0a284a999 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 2f46977d4..33b7c9c25 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 59d0e219d..cadabdb75 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 2fdd8135a..858dfd173 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 64323aa3a..d275b75f8 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 7550294b5..c1dcaccab 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) From 63cf4d01668f8f6c73a05039bc36785ba78b0940 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Wed, 1 Feb 2023 19:13:44 -0800 Subject: [PATCH 009/718] add riscv level3 C,Z kernel functions. --- kernel/riscv64/KERNEL.x280 | 85 +++- kernel/riscv64/trmm_lncopy_rvv_v1.c | 8 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 644 +++--------------------- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 658 +++---------------------- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 610 +++-------------------- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 623 +++-------------------- kernel/riscv64/zgemm_ncopy_4_rvv.c | 121 +++++ kernel/riscv64/zgemm_ncopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemm_tcopy_4_rvv.c | 181 +++++++ kernel/riscv64/zgemm_tcopy_rvv_v1.c | 74 +++ kernel/riscv64/zgemmkernel_rvv_v1x4.c | 475 ++++++++++++++++++ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 124 +++++ kernel/riscv64/zhemm_utcopy_rvv_v1.c | 120 +++++ kernel/riscv64/zsymm_lcopy_rvv_v1.c | 106 ++++ kernel/riscv64/zsymm_ucopy_rvv_v1.c | 106 ++++ kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 145 ++++++ kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 143 ++++++ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 144 ++++++ kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 140 ++++++ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 574 +++++++++++++++++++++ kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 115 +++++ kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 114 +++++ kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 113 +++++ kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 115 +++++ param.h | 10 +- 25 files changed, 3342 insertions(+), 2280 deletions(-) create mode 100644 kernel/riscv64/zgemm_ncopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_ncopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv.c create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_v1.c create mode 100644 kernel/riscv64/zgemmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/zhemm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/zhemm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_lcopy_rvv_v1.c create mode 100644 kernel/riscv64/zsymm_ucopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmm_utcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrmmkernel_rvv_v1x4.c create mode 100644 kernel/riscv64/ztrsm_lncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_uncopy_rvv_v1.c create mode 100644 kernel/riscv64/ztrsm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 4d64354fb..217d8534e 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -118,8 +118,8 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -CTRMMKERNEL = ztrmmkernel_2x2_rvv.c -ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c # SGEMM_UNROLL_N set in params.h ifeq ($(SGEMM_UNROLL_N), 8) @@ -168,17 +168,28 @@ DSYMMUCOPY_M = symm_ucopy_rvv_v1.c DSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c @@ -190,20 +201,25 @@ DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c -TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c -TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c -TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c -TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c @@ -214,6 +230,27 @@ CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c LSAME_KERNEL = ../generic/lsame.c diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 73a8233f8..3457ca3e1 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT vse32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 #define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 #define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 11a0398ca..2cba06b38 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -88,606 +91,107 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - FLOAT *pa, *pc; + FLOAT aa; + FLOAT* pc; int i, j, k; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug - size_t vl; - FLOAT_V_T va, vc; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - pa = a; - pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - VSEV_FLOAT(pc, vc, vl); - pa += vl; - pc += vl; - } - } - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; - - int i, j, k; - fprintf(stderr, "%s , %s, m = %4ld n = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug + FLOAT_V_T vb, vc; size_t vl; - FLOAT_V_T va, vc0, vc1; a += (m - 1) * m; b += (m - 1) * n; - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - //bb = *(c + i + j * ldc); - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - //bb *= aa; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - //*b = bb; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - - //*(c + i + j * ldc) = bb; - //b ++; - - b += 2; - //pa = a + i + 1; - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - pa = a; - //pc = c + j * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - - a -= m; - b -= 2 * n; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; - - int i, j, k; - - size_t vl; - FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (m - 1) * m; - b += (m - 1) * n; + for (i = m - 1; i >= 0; i--) { - for (i = m - 1; i >= 0; i--) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - - b += 8; - - pc0 = c + j * ldc * 8; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc + j * ldc * 4; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - - b += 4; - - pc0 = c + j * ldc * 4; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - - *b = bb0; - *(b+1) = bb1; - - *pb0 = bb0; - *pb1 = bb1; - - b += 2; - - pc0 = c + j * ldc * 2; - pc1 = pc0 + ldc; - - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = (*pb0) * aa; - *b = bb0; - *pb0 = bb0; - b += 1; - - pc0 = pc - i; - pa = a; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - - pa += vl; - pc0 += vl; - } - } - a -= m; b -= 2 * n; } } -#else -static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - - int i, j, k; - - a += (m - 1) * m; - b += (m - 1) * n; - - for (i = m - 1; i >= 0; i--) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = 0; k < i; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a -= m; - b -= 2 * n; - } - -} - -#endif - #else static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; - a += (m - 1) * m * 2; - b += (m - 1) * n * 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = m - 1; i >= 0; i--) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + for (i = m - 1; i >= 0; i--) { - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - } - + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; } - a -= m * 2; - b -= 4 * n; - } - } + #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 0380bd1bb..492a5631f 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m2(n) #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 #define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 #define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -87,468 +90,39 @@ static FLOAT dm1 = -1.; // Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - FLOAT aa, bb; - FLOAT *pa, *pc; - int i, j, k; - size_t vl; - FLOAT_V_T va, vc; - for (i = 0; i < m; i++) - { - aa = *(a + i); - for (j = 0; j < n; j ++) - { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b++; - pa = a + i + 1; - pc = c + j * ldc + i + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLEV_FLOAT(pc, vl); - va = VLEV_FLOAT(pa, vl); - vc = VFNMSACVF_FLOAT(vc, bb, va, vl); - VSEV_FLOAT(pc, vc, vl); - pa += vl; - pc += vl; - } - } - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb0, bb1; - FLOAT *pa, *pc, *pc0, *pc1; - FLOAT *pb0, *pb1; + FLOAT aa; + FLOAT* pc; int i, j, k; - size_t vl; - FLOAT_V_T va, vc0, vc1; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/2; j ++) - { - pb0 = pc + j * ldc * 2; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - } - pc += ldc * (n/2) * 2; - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 4 -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - FLOAT aa, bb0, bb1, bb2, bb3; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3; - FLOAT *pb0, *pb1, *pb2, *pb3; + FLOAT_V_T vb, vc; - int i, j, k; size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3; - for (i = 0; i < m; i++) - { - aa = *(a + i); - pc = c + i; - for (j = 0; j < n/4; j ++) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - } - } - pc += ldc * (n/4) * 4; - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - - a += m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) -{ - - FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7; - FLOAT *pa, *pc; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7; + for (i = 0; i < m; i++) { - int i, j, k; - size_t vl; - FLOAT_V_T va; - FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - for (i = 0; i < m; i++) - { aa = *(a + i); - pc = c + i; - for (j = 0; j < n/8; j ++) - { - pb0 = pc + j * ldc * 8; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - pb4 = pb3 + ldc; - pb5 = pb4 + ldc; - pb6 = pb5 + ldc; - pb7 = pb6 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - bb4 = (*pb4) * aa; - bb5 = (*pb5) * aa; - bb6 = (*pb6) * aa; - bb7 = (*pb7) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - *(b+4) = bb4; - *(b+5) = bb5; - *(b+6) = bb6; - *(b+7) = bb7; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - *pb4 = bb4; - *pb5 = bb5; - *pb6 = bb6; - *pb7 = bb7; - b += 8; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - pc4 = pc3 + ldc; - pc5 = pc4 + ldc; - pc6 = pc5 + ldc; - pc7 = pc6 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - vc4 = VLEV_FLOAT(pc4, vl); - vc5 = VLEV_FLOAT(pc5, vl); - vc6 = VLEV_FLOAT(pc6, vl); - vc7 = VLEV_FLOAT(pc7, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl); - vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl); - vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl); - vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - VSEV_FLOAT(pc4, vc4, vl); - VSEV_FLOAT(pc5, vc5, vl); - VSEV_FLOAT(pc6, vc6, vl); - VSEV_FLOAT(pc7, vc7, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; - pc4 += vl; - pc5 += vl; - pc6 += vl; - pc7 += vl; - } - } - pc += ldc * (n/8) * 8; - - if (n & 4) - { - pb0 = pc; - pb1 = pb0 + ldc; - pb2 = pb1 + ldc; - pb3 = pb2 + ldc; - - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - bb2 = (*pb2) * aa; - bb3 = (*pb3) * aa; - - *b = bb0; - *(b+1) = bb1; - *(b+2) = bb2; - *(b+3) = bb3; - - *pb0 = bb0; - *pb1 = bb1; - *pb2 = bb2; - *pb3 = bb3; - b += 4; - - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - pc2 = pc1 + ldc; - pc3 = pc2 + ldc; - - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - vc2 = VLEV_FLOAT(pc2, vl); - vc3 = VLEV_FLOAT(pc3, vl); - - va = VLEV_FLOAT(pa, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl); - vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl); - - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - VSEV_FLOAT(pc2, vc2, vl); - VSEV_FLOAT(pc3, vc3, vl); - - pa += vl; - pc0 += vl; - pc1 += vl; - pc2 += vl; - pc3 += vl; + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); } - pc += ldc * 4; + pc += vl * ldc; } - - if (n & 2) - { - pb0 = pc; - pb1 = pb0 + ldc; - bb0 = (*pb0) * aa; - bb1 = (*pb1) * aa; - *b = bb0; - *(b+1) = bb1; - *pb0 = bb0; - *pb1 = bb1; - b += 2; - pa = a + i + 1; - pc0 = pb0 + 1; - pc1 = pc0 + ldc; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - vc1 = VLEV_FLOAT(pc1, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - VSEV_FLOAT(pc1, vc1, vl); - pa += vl; - pc0 += vl; - pc1 += vl; - } - pc += ldc * 2; - } - - if (n & 1) - { - pb0 = pc; - bb0 = *(pb0); - bb0 *= aa; - *b = bb0; - *(c + i) = bb0; - b++; - pa = a + i + 1; - pc0 = pb0 + 1; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLEV_FLOAT(pc0, vl); - va = VLEV_FLOAT(pa, vl); - vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl); - VSEV_FLOAT(pc0, vc0, vl); - pa += vl; - pc0 += vl; - } - } - a += m; } } @@ -557,146 +131,60 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - for (i = 0; i < m; i++) { - - aa = *(a + i); - - for (j = 0; j < n; j ++) { - bb = *(c + i + j * ldc); - bb *= aa; - *b = bb; - *(c + i + j * ldc) = bb; - b ++; - - for (k = i + 1; k < m; k ++){ - *(c + k + j * ldc) -= bb * *(a + k); - } - - } - a += m; - } -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; - - for (i = 0; i < m; i++) { - - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); - - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); - -#ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; -#else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; -#endif - - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - for (k = i + 1; k < m; k ++){ -#ifndef CONJ - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#else - *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); - *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); -#endif - } - - } - a += m * 2; - } -} - - -static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - FLOAT *pa, *pc; - - int i, j, k; - - size_t vl; - FLOAT_V_T va0, va1, vc0, vc1; + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; - ldc *= 2; + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; - for (i = 0; i < m; i++) { + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; - aa1 = *(a + i * 2 + 0); - aa2 = *(a + i * 2 + 1); + ldc *= 2; - for (j = 0; j < n; j ++) { - bb1 = *(c + i * 2 + 0 + j * ldc); - bb2 = *(c + i * 2 + 1 + j * ldc); + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = aa1 * bb2 - aa2 * bb1; + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif + VSSEG2_FLOAT(b, vs1, vs2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + b += vl * 2; - *(b + 0) = cc1; - *(b + 1) = cc2; - *(c + i * 2 + 0 + j * ldc) = cc1; - *(c + i * 2 + 1 + j * ldc) = cc2; - b += 2; - - pa = a + (i + 1) * 2; - pc = c + j * ldc + (i + 1) * 2; - for (k = (m - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, pa, vl); - VLSEG2_FLOAT(&vc0, &vc1, pc, vl); + for (k = i + 1; k < m; k++) { + VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); #ifndef CONJ - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFMACCVF_FLOAT(vc0, cc2, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); -#else - vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0); - vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1); - vc1 = VFMACCVF_FLOAT(vc1, cc1, va1); - vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0); + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pc, vc0, vc1, vl); - pa += vl * 2; - pc += vl * 2; + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + } + pc += vl * ldc * 2; } - } + + a += m * 2; } - a += m * 2; - } } #endif @@ -714,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 41368be60..4751ae012 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -32,28 +32,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 #define VSSEV_FLOAT vsse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 #define VSSEV_FLOAT vsse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif - static FLOAT dm1 = -1.; #ifdef CONJ @@ -86,569 +90,99 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 - static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - - for (j = 0; j < m; j ++) - { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b + i + 1; - pc = c + j + (i + 1) *ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc ++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT_V_T va, vc; - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; + for (i = 0; i < n; i++) { - for (i = 0; i < n; i++) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b += n; } } -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT bb; - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - for (i = 0; i < n; i++) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - - a += 8; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - pc4 = pa4 + ldc; - pc5 = pa5 + ldc; - pc6 = pa6 + ldc; - pc7 = pa7 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - - a += 4; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - pc2 = pa2 + ldc; - pc3 = pa3 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - - a += 2; - - pb = b + i + 1; - pc0 = pa0 + ldc; - pc1 = pa1 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b + i + 1; - pc0 = pa0 + ldc; - for (k = (n - i - 1); k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b += n; - } -} #else -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa, bb; - int i, j, k; - - for (i = 0; i < n; i++) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = i + 1; k < n; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b += n; - } -} - -#endif +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { -#else + FLOAT bb1, bb2; -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *pci, *pcj; - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; + int i, j, k; - int i, j, k; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - ldc *= 2; + size_t vl; - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); - for (j = 0; j < m; j ++) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = -aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = i + 1; k < n; k ++){ + for (k = i + 1; k < n; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b += n * 2; } - b += n * 2; - } } #endif @@ -666,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug j = (n >> GEMM_UNROLL_N_SHIFT); diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 459c1663a..93a9e6916 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -32,25 +32,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t #define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSEV_FLOAT vse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 #define VSSEG2_FLOAT vsseg2e32_v_f32m2 #define VFMACCVF_FLOAT vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 #else #define VSETVL(n) vsetvl_e64m2(n) #define VSETVL_MAX vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t #define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSEV_FLOAT vse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 #define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 #define VFMACCVF_FLOAT vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 #endif @@ -86,497 +85,38 @@ static FLOAT dm1 = -1.; #ifndef COMPLEX -#if GEMM_DEFAULT_UNROLL_N == 1 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - FLOAT *pb, *pc; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc; - - a += (n - 1) * m; - b += (n - 1) * n; - - for (i = n - 1; i >= 0; i--) { - - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - pb = b; - pc = c + j; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc = VLSEV_FLOAT(pc, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc = VFNMSACVF_FLOAT(vc, aa, vb, vl); - VSSEV_FLOAT(pc, stride_ldc, vc, vl); - pb += vl; - pc++; - } - } - b -= n; - a -= 2 * m; - } - -} -#elif GEMM_DEFAULT_UNROLL_N == 2 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT bb; + FLOAT *pci, *pcj; - FLOAT aa0, aa1, bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pc0, *pc1; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT_V_T va, vc; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/2; j ++) - { - pa0 = pc + j * 2; - pa1 = pc + j * 2 + 1; - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 2; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - pb += vl; - pc0++; - pc1++; - } - } - pc += (m/2)*2; - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} - -#elif GEMM_DEFAULT_UNROLL_N == 4 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3; - FLOAT *pc0, *pc1, *pc2, *pc3; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3; a += (n - 1) * m; b += (n - 1) * n; - for (i = n - 1; i >= 0; i--) - { - bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/4; j ++) - { - pa0 = pc + j * 4; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - } - pc += (m/4)*4; - - if (m & 2) - { - pa0 = pc + j * 2; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = c + j * 4; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; - } - } - b -= n; - a -= 2 * m; - } -} -#elif GEMM_DEFAULT_UNROLL_N == 8 - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7; - FLOAT bb; - FLOAT *pb, *pc; - FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; - FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7; - BLASLONG stride_ldc = sizeof(FLOAT) * ldc; - int i, j, k; - size_t vl; - FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7; - - a += (n - 1) * m; - b += (n - 1) * n; + for (i = n - 1; i >= 0; i--) { - for (i = n - 1; i >= 0; i--) - { bb = *(b + i); - pc = c + i * ldc; - for (j = 0; j < m/8; j ++) - { - pa0 = pc + j * 8; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - pa4 = pa3 + 1; - pa5 = pa4 + 1; - pa6 = pa5 + 1; - pa7 = pa6 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - aa4 = *pa4 * bb; - aa5 = *pa5 * bb; - aa6 = *pa6 * bb; - aa7 = *pa7 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - *pa4 = aa4; - *pa5 = aa5; - *pa6 = aa6; - *pa7 = aa7; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - *(a + 4)= aa4; - *(a + 5)= aa5; - *(a + 6)= aa6; - *(a + 7)= aa7; - a += 8; - - pb = b; - pc0 = c + j * 8; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - pc4 = pc3 + 1; - pc5 = pc4 + 1; - pc6 = pc5 + 1; - pc7 = pc6 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl); - vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl); - vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl); - vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl); - vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl); - vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl); - vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - VSSEV_FLOAT(pc4, stride_ldc, vc4, vl); - VSSEV_FLOAT(pc5, stride_ldc, vc5, vl); - VSSEV_FLOAT(pc6, stride_ldc, vc6, vl); - VSSEV_FLOAT(pc7, stride_ldc, vc7, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - pc4++; - pc5++; - pc6++; - pc7++; - } - } - pc += (m/8)*8; - - if (m & 4) - { - pa0 = pc; - pa1 = pa0 + 1; - pa2 = pa1 + 1; - pa3 = pa2 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - aa2 = *pa2 * bb; - aa3 = *pa3 * bb; - - *pa0 = aa0; - *pa1 = aa1; - *pa2 = aa2; - *pa3 = aa3; - - *a = aa0; - *(a + 1)= aa1; - *(a + 2)= aa2; - *(a + 3)= aa3; - a += 4; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - pc2 = pc1 + 1; - pc3 = pc2 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl); - vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl); - vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - VSSEV_FLOAT(pc2, stride_ldc, vc2, vl); - VSSEV_FLOAT(pc3, stride_ldc, vc3, vl); - - pb += vl; - pc0++; - pc1++; - pc2++; - pc3++; - } - pc += 4; - } - - if (m & 2) - { - pa0 = pc; - pa1 = pa0 + 1; - - aa0 = *pa0 * bb; - aa1 = *pa1 * bb; - - *pa0 = aa0; - *pa1 = aa1; - - *a = aa0; - *(a + 1)= aa1; - a += 2; - - pb = b; - pc0 = pc - i * ldc; - pc1 = pc0 + 1; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl); - - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - VSSEV_FLOAT(pc1, stride_ldc, vc1, vl); - - pb += vl; - pc0++; - pc1++; - } - pc += 2; - } - - if (m & 1) - { - pa0 = pc; - aa0 = *pa0 * bb; - - *pa0 = aa0; - *a = aa0; - a += 1; - - pb = b; - pc0 = pc - i * ldc; - for (k = i; k > 0; k -= vl) - { - vl = VSETVL(k); - vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl); - vb = VLEV_FLOAT(pb, vl); - vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl); - VSSEV_FLOAT(pc0, stride_ldc, vc0, vl); - pb += vl; - pc0++; + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); } + pcj += vl; } b -= n; a -= 2 * m; @@ -587,92 +127,65 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT aa, bb; - - int i, j, k; - - a += (n - 1) * m; - b += (n - 1) * n; + FLOAT bb1, bb2; - for (i = n - 1; i >= 0; i--) { + FLOAT *pci, *pcj; - bb = *(b + i); - - for (j = 0; j < m; j ++) { - aa = *(c + j + i * ldc); - aa *= bb; - *a = aa; - *(c + j + i * ldc) = aa; - a ++; - - for (k = 0; k < i; k ++){ - *(c + j + k * ldc) -= aa * *(b + k); - } - - } - b -= n; - a -= 2 * m; - } - -} - -#endif - -#else - -static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - - FLOAT aa1, aa2; - FLOAT bb1, bb2; - FLOAT cc1, cc2; - - int i, j, k; - - ldc *= 2; + int i, j, k; - a += (n - 1) * m * 2; - b += (n - 1) * n * 2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; - for (i = n - 1; i >= 0; i--) { + size_t vl; - bb1 = *(b + i * 2 + 0); - bb2 = *(b + i * 2 + 1); + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; - for (j = 0; j < m; j ++) { + for (i = n - 1; i >= 0; i--) { - aa1 = *(c + j * 2 + 0 + i * ldc); - aa2 = *(c + j * 2 + 1 + i * ldc); + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + VLSEG2_FLOAT(&va1, &va2, pci, vl); #ifndef CONJ - cc1 = aa1 * bb1 - aa2 * bb2; - cc2 = aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); #else - cc1 = aa1 * bb1 + aa2 * bb2; - cc2 = - aa1 * bb2 + aa2 * bb1; + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif + VSSEG2_FLOAT(a, vs1, vs2, vl); + VSSEG2_FLOAT(pci, vs1, vs2, vl); + a += vl * 2; + pci += vl * 2; - *(a + 0) = cc1; - *(a + 1) = cc2; - - *(c + j * 2 + 0 + i * ldc) = cc1; - *(c + j * 2 + 1 + i * ldc) = cc2; - a += 2; - - for (k = 0; k < i; k ++){ + for (k = 0; k < i; k ++){ + VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); #ifndef CONJ - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #else - *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); - *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - } - + VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; } - b -= n * 2; - a -= 4 * m; - } - } #endif @@ -689,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, size_t vl = VSETVL_MAX; - //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug kk = n - offset; c += n * ldc * COMPSIZE; diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c new file mode 100644 index 000000000..389ee5d57 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +// Optimizes the implementation in ../generic/zgemm_ncopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + + FLOAT *boffset; + + FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + size_t vl; + + aoffset = a; + boffset = b; + lda *= 2; + + for (j = (n >> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); + VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); + + VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + + VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + + VSSEG2_FLOAT(boffset, v11, v12, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..df039bab6 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, v0, v1, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 000000000..1b34039c8 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT vle32_v_f32m1 +#define VSEV_FLOAT vse32_v_f32m1 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#else +#define VSETVL(n) vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT vle64_v_f64m1 +#define VSEV_FLOAT vse64_v_f64m1 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, v0, v1, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..7622fb810 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0, v1; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); + VSSEG2_FLOAT(boffset, v0, v1, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 000000000..50e29222f --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,475 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + VLSEG2_FLOAT(&va0, &va1, C2, vl); + VLSEG2_FLOAT(&va2, &va3, C3, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + VLSEG2_FLOAT(&va2, &va3, C1, vl); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + } + + for (k = (bk & 3); k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + VLSEG2_FLOAT(&va0, &va1, C0, vl); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..cf466d3fa --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 000000000..6209f5417 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,120 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); + vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c new file mode 100644 index 000000000..df5c916a5 --- /dev/null +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); + VLSEG2_FLOAT(&va10, &va11, ao1, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 000000000..dcf2b081a --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define VSETVL_MAX vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT vid_v_i32m2 +#define VADD_VX_INT vadd_vx_i32m2 +#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define VSETVL_MAX vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT vid_v_i64m2 +#define VADD_VX_INT vadd_vx_i64m2 +#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); + VLSEG2_FLOAT(&va20, &va21, ao2, vl); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); + vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + VSSEG2_FLOAT(b, vb0, vb1, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..afd694408 --- /dev/null +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -0,0 +1,145 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT vid_v_i32m2 +#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..c7d593949 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + //VSEV_FLOAT(b, vb, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..3c70b6385 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VLSEV_FLOAT vlse32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VLSEV_FLOAT vlse64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > posY) + { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..706782cf0 --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,140 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); + va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); +#endif + VSSEG2_FLOAT(b, va0, va1, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 000000000..27409ec25 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,574 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT vle32_v_f32m2 +#define VSEV_FLOAT vse32_v_f32m2 +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT vfmul_vf_f32m2 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT vle64_v_f64m2 +#define VSEV_FLOAT vse64_v_f64m2 +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + VSSEG2_FLOAT(C2, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + VSSEG2_FLOAT(C3, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + VSSEG2_FLOAT(C1, va2, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + VSSEG2_FLOAT(C0, va0, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..b7ccb1eb3 --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..911b81de5 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..db075c29b --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va0, va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..e121c6273 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT vid_v_u32m2 +#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT vid_v_u64m2 +#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va0, va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + VLSEG2_FLOAT(&va0, &va1, ao, vl); + VSSEG2_FLOAT(b, va0, va1, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/param.h b/param.h index 62b675d6c..236f50075 100644 --- a/param.h +++ b/param.h @@ -3055,11 +3055,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 #define DGEMM_DEFAULT_UNROLL_MN 32 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 160 #define DGEMM_DEFAULT_P 160 From 240695862984d4de845f1c42821a883946932df7 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 24 Feb 2023 10:44:55 +0000 Subject: [PATCH 010/718] * update intrinsics to match latest spec at https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_ prefixes for rvv intrinsics) * fix multiple numerical stability and corner case issues * add a script to generate arbitrary gemm kernel shapes * add a generic zvl256b target to demonstrate large gemm kernel unrolls --- common_riscv64.h | 15 +- cpuid_riscv64.c | 10 +- kernel/generic/trmmkernel_16x8.c | 3676 ++++++++++++++++++++ kernel/generic/zlaswp_ncopy_8.c | 1051 ++++++ kernel/riscv64/KERNEL.RISCV64_ZVL256B | 199 ++ kernel/riscv64/amax_vector.c | 231 +- kernel/riscv64/amin_vector.c | 252 +- kernel/riscv64/asum_vector.c | 99 +- kernel/riscv64/axpby_vector.c | 47 +- kernel/riscv64/axpy_vector.c | 42 +- kernel/riscv64/cgemm_kernel_8x8_zvl256b.c | 1931 ++++++++++ kernel/riscv64/copy_vector.c | 39 +- kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c | 2007 +++++++++++ kernel/riscv64/dgemm_kernel_8x8_zvl256b.c | 860 +++++ kernel/riscv64/dot.c | 2 +- kernel/riscv64/dot_vector.c | 86 +- kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c | 1068 ++++++ kernel/riscv64/gemv_n_vector.c | 24 +- kernel/riscv64/gemv_t_vector.c | 91 +- kernel/riscv64/generate_kernel.py | 670 ++++ kernel/riscv64/iamax_vector.c | 180 +- kernel/riscv64/iamin_vector.c | 160 +- kernel/riscv64/imax_vector.c | 124 +- kernel/riscv64/imin_vector.c | 185 +- kernel/riscv64/izamax_vector.c | 277 +- kernel/riscv64/izamin_vector.c | 275 +- kernel/riscv64/max_vector.c | 77 +- kernel/riscv64/min_vector.c | 77 +- kernel/riscv64/nrm2_vector.c | 342 +- kernel/riscv64/nrm2_vector_dot.c | 8 +- kernel/riscv64/rot_vector.c | 42 +- kernel/riscv64/scal_vector.c | 83 +- kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ++++++ kernel/riscv64/strmm_kernel_16x8_zvl256b.c | 1330 +++++++ kernel/riscv64/sum_vector.c | 114 + kernel/riscv64/swap_vector.c | 54 +- kernel/riscv64/symv_L_vector.c | 82 +- kernel/riscv64/symv_U_vector.c | 86 +- kernel/riscv64/zamax_vector.c | 90 +- kernel/riscv64/zamin_vector.c | 89 +- kernel/riscv64/zasum_vector.c | 107 +- kernel/riscv64/zaxpby_vector.c | 32 +- kernel/riscv64/zaxpy_vector.c | 20 +- kernel/riscv64/zcopy_vector.c | 12 +- kernel/riscv64/zdot_vector.c | 60 +- kernel/riscv64/zgemm_kernel_8x4_zvl256b.c | 1253 +++++++ kernel/riscv64/zgemm_kernel_generic.c | 140 + kernel/riscv64/zgemv_n_vector.c | 28 +- kernel/riscv64/zgemv_t_vector.c | 88 +- kernel/riscv64/zhemv_LM_vector.c | 60 +- kernel/riscv64/zhemv_UV_vector.c | 60 +- kernel/riscv64/znrm2_vector.c | 365 +- kernel/riscv64/zrot_vector.c | 38 +- kernel/riscv64/zscal_vector.c | 32 +- kernel/riscv64/zsum_vector.c | 131 + kernel/riscv64/zswap_vector.c | 50 +- kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c | 1337 +++++++ param.h | 39 + 58 files changed, 18634 insertions(+), 2374 deletions(-) create mode 100644 kernel/generic/trmmkernel_16x8.c create mode 100644 kernel/generic/zlaswp_ncopy_8.c create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL256B create mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c create mode 100755 kernel/riscv64/generate_kernel.py create mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/strmm_kernel_16x8_zvl256b.c create mode 100644 kernel/riscv64/sum_vector.c create mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c create mode 100644 kernel/riscv64/zgemm_kernel_generic.c create mode 100644 kernel/riscv64/zsum_vector.c create mode 100644 kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c diff --git a/common_riscv64.h b/common_riscv64.h index 2092bd5ab..de79c8cab 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) -#include -#endif - -#if defined(x280) -#include +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +# include +# if !defined(DOUBLE) +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# else +# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# endif +#else +# define EXTRACT_FLOAT(v) (v[0]) #endif #endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 5326787e6..1b6b62f21 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 -#define CPU_x280 2 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_RISCV64_ZVL256B 2 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" - "x280" + "C910V", + "CPU_RISCV64_ZVL256B" }; int detect(void){ diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 000000000..5412eab70 --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + FLOAT res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 000000000..d8690682f --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 1b7799340..81a39af32 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +95,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index f9b7defae..c8ba75f4a 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = *x; + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_x == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..b1739f248 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + a_regs = max(int(M/vlenmax), 1) + + accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > 32: + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & {N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + @classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 9fea522f7..92880fbcf 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFABS_FLOAT __riscv_vfabs_v_f64m4 +#define VCOMPRESS __riscv_vcompress_vm_u64m4 +#define VMV_X __riscv_vmv_x_s_u64m4_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFABS_FLOAT __riscv_vfabs_v_f32m4 +#define VCOMPRESS __riscv_vcompress_vm_u32m4 +#define VMV_X __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 4e81e7848..0503f9948 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index ca48a3c48..e24f9fd48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index 2a677098d..a60bd3d07 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +142,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 66a101566..89cd510c1 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT __riscv_vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 818193a9e..74daf32b8 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VMFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFABS_FLOAT __riscv_vfabs_v_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VCOMPRESS __riscv_vcompress_vm_u64m8 +#define VMV_X __riscv_vmv_x_s_u64m8_u64 #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VMFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT __riscv_vse32_v_u32m8 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFABS_FLOAT __riscv_vfabs_v_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VCOMPRESS __riscv_vcompress_vm_u32m8 +#define VMV_X __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); - if(cur_minf < minf){ + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 7f31e9a53..97f602e51 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 14b7e01ed..77bf19b9d 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; if (n < 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 - +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_max = VFMAXVV_FLOAT(v_max, v0, gvl); @@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); if(j maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index d9eca7f10..095b1c3df 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) +#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); mask0 = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); + v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl); mask1 = VMFLTVF_FLOAT(v1, 0, gvl); - v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); + v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl); v0 = VFADDVV_FLOAT(v0, v1, gvl); v_min = VFMINVV_FLOAT(v_min, v0, gvl); @@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; ix += inc_xv; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); if(j -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k 0 ){ // scale change? + // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i Date: Wed, 1 Mar 2023 17:40:42 +0000 Subject: [PATCH 011/718] factoring riscv64/dot.c fix into separate PR as requested --- kernel/riscv64/dot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca..46a84ad18 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 1 ) return(dot); + if ( n < 0 ) return(dot); while(i < n) { From 1374a2d08b078451dcfaf723614ea13e441e1d06 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 19 Mar 2023 23:59:03 -0700 Subject: [PATCH 012/718] This PR adapts latest spec changes Add prefix (_riscv) for all riscv intrinsics Update some intrinsics' parameter, like vfredxxxx, vmerge --- kernel/riscv64/amax_rvv.c | 54 +++++------ kernel/riscv64/amin_rvv.c | 54 +++++------ kernel/riscv64/asum_rvv.c | 54 +++++------ kernel/riscv64/axpby_rvv.c | 36 +++---- kernel/riscv64/axpy_rvv.c | 28 +++--- kernel/riscv64/copy_rvv.c | 24 ++--- kernel/riscv64/dot_rvv.c | 66 ++++++------- kernel/riscv64/gemm_beta_rvv.c | 24 ++--- kernel/riscv64/gemm_ncopy_8_rvv.c | 28 +++--- kernel/riscv64/gemm_ncopy_rvv_v1.c | 20 ++-- kernel/riscv64/gemm_tcopy_8_rvv.c | 44 ++++----- kernel/riscv64/gemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/gemmkernel_rvv_v1x8.c | 24 ++--- kernel/riscv64/gemv_n_rvv.c | 28 +++--- kernel/riscv64/gemv_t_rvv.c | 53 +++++------ kernel/riscv64/iamax_rvv.c | 115 +++++++++++----------- kernel/riscv64/iamin_rvv.c | 117 +++++++++++------------ kernel/riscv64/imax_rvv.c | 113 +++++++++++----------- kernel/riscv64/imin_rvv.c | 113 +++++++++++----------- kernel/riscv64/izamax_rvv.c | 127 ++++++++++++------------- kernel/riscv64/izamin_rvv.c | 121 ++++++++++++----------- kernel/riscv64/max_rvv.c | 50 +++++----- kernel/riscv64/min_rvv.c | 50 +++++----- kernel/riscv64/nrm2_rvv.c | 46 ++++----- kernel/riscv64/rot_rvv.c | 36 +++---- kernel/riscv64/scal_rvv.c | 32 +++---- kernel/riscv64/sum_rvv.c | 50 +++++----- kernel/riscv64/swap_rvv.c | 28 +++--- kernel/riscv64/symm_lcopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symm_ucopy_rvv_v1.c | 50 +++++----- kernel/riscv64/symv_L_rvv.c | 81 ++++++++-------- kernel/riscv64/symv_U_rvv.c | 81 ++++++++-------- kernel/riscv64/trmm_lncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_ltcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmm_uncopy_rvv_v1.c | 48 +++++----- kernel/riscv64/trmm_utcopy_rvv_v1.c | 44 ++++----- kernel/riscv64/trmmkernel_rvv_v1x8.c | 28 +++--- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 54 +++++------ kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 42 ++++---- kernel/riscv64/trsm_lncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_ltcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_uncopy_rvv_v1.c | 40 ++++---- kernel/riscv64/trsm_utcopy_rvv_v1.c | 40 ++++---- kernel/riscv64/zamax_rvv.c | 58 +++++------ kernel/riscv64/zamin_rvv.c | 58 +++++------ kernel/riscv64/zasum_rvv.c | 51 +++++----- kernel/riscv64/zaxpby_rvv.c | 52 +++++----- kernel/riscv64/zaxpy_rvv.c | 32 +++---- kernel/riscv64/zcopy_rvv.c | 44 ++++----- kernel/riscv64/zdot_rvv.c | 65 +++++++------ kernel/riscv64/zgemm_beta_rvv.c | 32 +++---- kernel/riscv64/zgemm_ncopy_4_rvv.c | 24 ++--- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemm_tcopy_4_rvv.c | 40 ++++---- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 16 ++-- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 36 +++---- kernel/riscv64/zgemv_n_rvv.c | 48 +++++----- kernel/riscv64/zgemv_t_rvv.c | 61 ++++++------ kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 84 ++++++++-------- kernel/riscv64/znrm2_rvv.c | 66 ++++++------- kernel/riscv64/zrot_rvv.c | 52 +++++----- kernel/riscv64/zscal_rvv.c | 44 ++++----- kernel/riscv64/zsum_rvv.c | 47 +++++---- kernel/riscv64/zswap_rvv.c | 24 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 61 ++++++------ kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 64 ++++++------- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_2x2_rvv.c | 60 ++++++------ kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 40 ++++---- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 36 +++---- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 36 +++---- 79 files changed, 2013 insertions(+), 2031 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index c9c6e7f73..be0bdbea0 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index 370b6c338..d4926084b 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 4f711c9be..691591e22 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -93,7 +93,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); asumf = VFMVFS_FLOAT_M1(v_res); return(asumf); } diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index 7c35c563d..a1dbdb0e4 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c index 3986f4e21..8bc2f30de 100644 --- a/kernel/riscv64/axpy_rvv.c +++ b/kernel/riscv64/axpy_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 5d5a8bd04..041fd2dae 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 60dcac2f5..3276695b6 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -37,24 +37,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( n <= 0 ) return(dot); - size_t vlmax = vsetvlmax_e64m8(); - vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax); + size_t vlmax = __riscv_vsetvlmax_e64m8(); + vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } @@ -63,18 +63,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vle32_v_f32m4(x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vle64_v_f64m8(x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -82,18 +82,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_x = inc_x * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vle32_v_f32m4(y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vle64_v_f64m8(y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } else { @@ -102,25 +102,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG stride_y = inc_y * sizeof(FLOAT); for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { - vl = vsetvl_e64m8(n); + vl = __riscv_vsetvl_e64m8(n); #if !defined(DOUBLE) - vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl); - vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl); + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); #else - vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl); - vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl); + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); #endif } } - vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax); - vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax); - dot = vfmv_f_s_f64m1_f64(vec_sum); + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); return(dot); } diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c index 34d1ea078..f3cf6491d 100644 --- a/kernel/riscv64/gemm_beta_rvv.c +++ b/kernel/riscv64/gemm_beta_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 #endif // Optimizes the implementation in ../generic/gemm_beta.c diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 525b223c2..3030d67fb 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c index 2c5230752..2d6db15e5 100644 --- a/kernel/riscv64/gemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 81c1f962b..080a87312 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEV_FLOAT vlse32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEV_FLOAT vlse64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c index a291b70b8..c5fb6479f 100644 --- a/kernel/riscv64/gemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c index 5cd509f93..471b3158f 100644 --- a/kernel/riscv64/gemmkernel_rvv_v1x8.c +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #endif int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c index 9d2dee615..1366eb5ad 100644 --- a/kernel/riscv64/gemv_n_rvv.c +++ b/kernel/riscv64/gemv_n_rvv.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index a80af81b6..f0c834866 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -63,7 +63,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va, vx, vr; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -83,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; @@ -107,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO vr = VFMACCVV_FLOAT(vr, va, vx, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); *y += alpha * VFMVFS_FLOAT_M1(v_res); y += inc_y; a += lda; diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index 8b33b3bcb..ef7850a55 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -28,57 +28,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -134,16 +134,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 585b37186..56a086fed 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -29,57 +29,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m8 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m8 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -135,16 +135,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index d84ad968e..5b60a56f7 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_min; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index fb734f6f8..b49544a1b 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m8 -#define VMVVXS_UINT vmv_x_s_u64m8_u64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 #else -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFIRSTM vfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m8 -#define VMVVXS_UINT vmv_x_s_u32m8_u32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index 9cb332cbb..e61d0cbec 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -28,63 +28,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define MASK_T vbool16_t -#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 -#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 -#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFIRSTM vfirst_m_b16 -#define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT vid_v_u64m4_m -#define VIDV_UINT vid_v_u64m4 -#define VADDVX_MASK_UINT vadd_vx_u64m4_m -#define VADDVX_UINT vadd_vx_u64m4 -#define VMVVX_UINT vmv_v_x_u64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 -#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 #else -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 -#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 -#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT vid_v_u32m4_m -#define VIDV_UINT vid_v_u32m4 -#define VADDVX_MASK_UINT vadd_vx_u32m4_m -#define VADDVX_UINT vadd_vx_u32m4 -#define VMVVX_UINT vmv_v_x_u32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 -#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,24 +138,23 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } } - FLOAT_V_T_M1 v_res, v_z0; + FLOAT_V_T_M1 v_res; v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_z0 = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); max_index = VFIRSTM(mask, vlmax); - v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax); + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); max_index = VMVVXS_UINT(v_max_index); return(max_index+1); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 69771e5aa..297b3c99a 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -29,59 +29,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 -#define MASK_T vbool16_t -#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 -#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16 -#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMINVV_FLOAT vfmin_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFIRSTM vfirst_m_b16 -#define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT vid_v_u64m4_m -#define VIDV_UINT vid_v_u64m4 -#define VADDVX_MASK_UINT vadd_vx_u64m4_m -#define VADDVX_UINT vadd_vx_u64m4 -#define VMVVX_UINT vmv_v_x_u64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VSLIDEDOWN_UINT vslidedown_vx_u64m4 -#define VMVVXS_UINT vmv_x_s_u64m4_u64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 #else -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 -#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8 -#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMINVV_FLOAT vfmin_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFIRSTM vfirst_m_b8 -#define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT vid_v_u32m4_m -#define VIDV_UINT vid_v_u32m4 -#define VADDVX_MASK_UINT vadd_vx_u32m4_m -#define VADDVX_UINT vadd_vx_u32m4 -#define VMVVX_UINT vmv_v_x_u32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VSLIDEDOWN_UINT vslidedown_vx_u32m4 -#define VMVVXS_UINT vmv_x_s_u32m4_u32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -145,16 +145,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_res, v_max; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax); + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); mask = VMFLEVF_FLOAT(v_min, minf, vlmax); min_index = VFIRSTM(mask, vlmax); - v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax); + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); min_index = VMVVXS_UINT(v_min_index); return(min_index+1); diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 5b1380d2b..9315321f4 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index bddcc0ba7..158b682fd 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 979c31648..42abfa119 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -29,30 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define ABS fabsf #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define ABS fabs #endif @@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c index 7bf5e4270..90f81d5e2 100644 --- a/kernel/riscv64/rot_rvv.c +++ b/kernel/riscv64/rot_rvv.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index d2c0378bf..2e2cfd31e 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 1db0d09dd..9715faf22 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -89,7 +89,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); sumf = VFMVFS_FLOAT_M1(v_res); return(sumf); } diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c index 2cf92f6ad..893d70554 100644 --- a/kernel/riscv64/swap_rvv.c +++ b/kernel/riscv64/swap_rvv.c @@ -28,23 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c index f0def9617..a615db44d 100644 --- a/kernel/riscv64/symm_lcopy_rvv_v1.c +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_lcopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c index 958506df3..464f97b3a 100644 --- a/kernel/riscv64/symm_ucopy_rvv_v1.c +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif // Optimizes the implementation in ../generic/symm_ucopy_4.c @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb = VMERGE_VVM_FLOAT(vbool, va2, va1, vl); + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); VSEV_FLOAT(b, vb, vl); b += vl; diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index 737abaae3..e87ab22ae 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -28,43 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T_M1 v_res, v_z0; size_t vlmax = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jy += inc_y; @@ -172,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[j] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; @@ -211,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); jx += inc_x; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index cb923be5d..3fbc33c89 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -29,43 +29,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T_M1 vfloat32m1_t -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMACCVV_FLOAT vfmacc_vv_f32m8 -#define VFMACCVF_FLOAT vfmacc_vf_f32m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMSACVF_FLOAT vfmsac_vf_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T_M1 vfloat64m1_t -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMACCVV_FLOAT vfmacc_vv_f64m8 -#define VFMACCVF_FLOAT vfmacc_vf_f64m8 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMSACVF_FLOAT vfmsac_vf_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT *a_ptr = a; FLOAT_V_T_M1 v_res, v_z0; size_t vl_max = VSETVL_MAX_M1, vl; - v_res = VFMVVF_FLOAT_M1(0, vl_max); v_z0 = VFMVVF_FLOAT_M1(0, vl_max); vl_max = VSETVL_MAX; @@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], vl); vr = VFMACCVV_FLOAT(vr, vx, va, vl); } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -171,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; @@ -209,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max); + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); a_ptr += lda; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c index 3457ca3e1..4135a9b62 100644 --- a/kernel/riscv64/trmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c @@ -116,10 +116,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c index 2fe8cf79e..580714fde 100644 --- a/kernel/riscv64/trmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c @@ -111,10 +111,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c index b64cd840d..852ab7f11 100644 --- a/kernel/riscv64/trmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c @@ -114,10 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLSEV_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao++; diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c index b96daae5b..e0b6d362d 100644 --- a/kernel/riscv64/trmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -32,27 +32,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif // Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c @@ -113,10 +113,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { va1 = VLEV_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - vb = VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); #endif VSEV_FLOAT(b, vb, vl); ao += lda; diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c index 97b14650c..393b24bce 100644 --- a/kernel/riscv64/trmmkernel_rvv_v1x8.c +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 2cba06b38..886af0c3b 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index 492a5631f..ddeef966c 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4751ae012..4c83bbaa3 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSSEV_FLOAT vsse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSSEV_FLOAT vsse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif static FLOAT dm1 = -1.; diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index 93a9e6916..b368eefb9 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -28,28 +28,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c index bacfb2b08..41c84be25 100644 --- a/kernel/riscv64/trsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c index 0fc7c9f24..003bd3465 100644 --- a/kernel/riscv64/trsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif #ifndef UNIT diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c index ee869a795..6cca5d49c 100644 --- a/kernel/riscv64/trsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c index a324b0fa6..bc058525f 100644 --- a/kernel/riscv64/trsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSEV_FLOAT_M vse32_v_f32m2_m -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSEV_FLOAT_M vse64_v_f64m2_m -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 1917042be..615b7519c 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -106,7 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax); + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); maxf = VFMVFS_FLOAT_M1(v_res); return(maxf); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index 3f027383a..a0d36d46f 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFABSV_FLOAT vfabs_v_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFABSV_FLOAT vfabs_v_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -105,7 +105,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax); + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); minf = VFMVFS_FLOAT_M1(v_res); return(minf); diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 7876646b3..1d2f0e1fe 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VFABSV_FLOAT vfabs_v_f32m8 +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VFABSV_FLOAT vfabs_v_f64m8 +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -99,9 +99,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); asumf += VFMVFS_FLOAT_M1(v_res); return(asumf); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index 66f52d9d0..e0da55311 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -33,33 +33,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 777bcb728..3f75898e0 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index 5d8322bbb..bd94810ce 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL_M8(n) vsetvl_e32m8(n) -#define FLOAT_V_T_M8 vfloat32m8_t -#define VLEV_FLOAT_M8 vle32_v_f32m8 -#define VSEV_FLOAT_M8 vse32_v_f32m8 - -#define VSETVL_M4(n) vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4 +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T_M4 vfloat32m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL_M8(n) vsetvl_e64m8(n) -#define FLOAT_V_T_M8 vfloat64m8_t -#define VLEV_FLOAT_M8 vle64_v_f64m8 -#define VSEV_FLOAT_M8 vse64_v_f64m8 - -#define VSETVL_M4(n) vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4 +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T_M4 vfloat64m4_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 7eae6f608..1543c513d 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -72,7 +72,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; size_t vlmax_m1 = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax_m1); v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); size_t vlmax = VSETVL_MAX; @@ -161,9 +160,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); CREAL(result) = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); CIMAG(result) = VFMVFS_FLOAT_M1(v_res); return(result); diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index a89752d18..b94b5f4bf 100644 --- a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -39,23 +39,23 @@ #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFADDVV_FLOAT vfadd_vv_f32m4 -#define VFSUBVV_FLOAT vfsub_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFADDVV_FLOAT vfadd_vv_f64m4 -#define VFSUBVV_FLOAT vfsub_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index 389ee5d57..d50a4b8d5 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index df039bab6..1d3b8d3b7 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 1b34039c8..8c35b5616 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VSEV_FLOAT vse32_v_f32m1 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT vsseg8e32_v_f32m1 +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 #else -#define VSETVL(n) vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VSEV_FLOAT vse64_v_f64m1 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT vsseg8e64_v_f64m1 +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7622fb810..7a085269c 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 50e29222f..41399cf79 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 2eeb61b45..4a40c30a7 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index b682d5cd8..15795cc3a 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -73,7 +73,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; - v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -105,9 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -149,9 +148,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index cf466d3fa..79b20a646 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -104,13 +104,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index 6209f5417..a86815275 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VMSLT_VX_INT vmslt_vx_i32m2_b16 -#define VMSEQ_VX_INT vmseq_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VMSLT_VX_INT vmslt_vx_i64m2_b32 -#define VMSEQ_VX_INT vmseq_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #endif @@ -101,13 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl); - vb1 = VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 921ddb8cb..5f7873b5a 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,35 +28,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif // TODO: Should single precision use the widening MAC, or perhaps all should be double? @@ -85,10 +85,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } @@ -103,16 +103,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); vr = VFMACCVV_FLOAT(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl); + v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); vr = VFMACCVV_FLOAT(vr, v1, v1, vl); } } - v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax); + v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); ssq = VFMVFS_FLOAT_M1(v_res); scale = VFMVFS_FLOAT_M1(v_max); diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index 68066a00b..ee81bfe91 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 079c36a2d..779fab68c 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 3928fbe27..44df112c6 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT vfadd_vv_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT vfadd_vv_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -88,9 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } - FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax); FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); sumf += VFMVFS_FLOAT_M1(v_res); return(sumf); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 86f9103d3..17b7b9f43 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT vssseg2e32_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT vssseg2e64_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index df5c916a5..0f9e04869 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) @@ -91,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index dcf2b081a..fdc693700 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define INT_V_T vint32m2_t -#define VID_V_INT vid_v_i32m2 -#define VADD_VX_INT vadd_vx_i32m2 -#define VMSGT_VX_INT vmsgt_vx_i32m2_b16 -#define VBOOL_T vbool16_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define INT_V_T vint64m2_t -#define VID_V_INT vid_v_i64m2 -#define VADD_VX_INT vadd_vx_i64m2 -#define VMSGT_VX_INT vmsgt_vx_i64m2_b32 -#define VBOOL_T vbool32_t -#define VMERGE_VVM_FLOAT vmerge_vvm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #endif @@ -92,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); - vb0 = VMERGE_VVM_FLOAT(vbool, va20, va10, vl); - vb1 = VMERGE_VVM_FLOAT(vbool, va21, va11, vl); + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); VSSEG2_FLOAT(b, vb0, vb1, vl); b += vl * 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index afd694408..7276618c5 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vint32m2_t -#define VID_V_UINT vid_v_i32m2 -#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -121,12 +121,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index c7d593949..72e8f2ce2 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -30,33 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,14 +117,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON //va1 = VLEV_FLOAT(ao, vl); VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - //VSEV_FLOAT(b, vb, vl); VSSEG2_FLOAT(b, va0, va1, vl); ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index 3c70b6385..e6d36c86d 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEV_FLOAT vlse32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEV_FLOAT vlse64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -120,12 +120,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += 2; diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 706782cf0..7085cfc37 100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -32,33 +32,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 -#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 -#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32 -#define VFMERGE_VFM_FLOAT vfmerge_vfm_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -117,12 +117,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { VLSEG2_FLOAT(&va0, &va1, ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); #ifdef UNIT vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); - va0 = VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl); - va1 = VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif VSSEG2_FLOAT(b, va0, va1, vl); ao += lda * 2; diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c index 3486a4648..399124d2e 100644 --- a/kernel/riscv64/ztrmmkernel_2x2_rvv.c +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif // Optimizes the implementation in ../generic/ztrmmkernel_2x2.c diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 27409ec25..92b4b855b 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2 -#define VFMULVF_FLOAT vfmul_vf_f32m2 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2 -#define VFMULVF_FLOAT vfmul_vf_f64m2 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index b7ccb1eb3..383cb883f 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index 911b81de5..f57e9f1de 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index db075c29b..be3613429 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -30,25 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #endif diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index e121c6273..b1f5ef8f0 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m -#define VBOOL_T vbool16_t -#define UINT_V_T vuint32m2_t -#define VID_V_UINT vid_v_u32m2 -#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16 +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m -#define VBOOL_T vbool32_t -#define UINT_V_T vuint64m2_t -#define VID_V_UINT vid_v_u64m2 -#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32 +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 #endif From 6b74bee2f9d7272f1932a9ba9bbd1bda6c122fbf Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Mon, 27 Mar 2023 18:59:24 -0700 Subject: [PATCH 013/718] Update TARGET=x280 description. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ecb46178..1f1c0f3ed 100644 --- a/README.md +++ b/README.md @@ -186,7 +186,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ``` (also known to work on C906) -- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0. +- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0. ```sh make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran ``` From 281e834566a06f1c756d262dc31e809faaf8933f Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Thu, 30 Mar 2023 15:15:25 -0400 Subject: [PATCH 014/718] do not pass -j flag to the MAKE variable --- getarch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/getarch.c b/getarch.c index 937a8db68..87384c084 100644 --- a/getarch.c +++ b/getarch.c @@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n"); #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 - printf("MAKE += -j %d\n", MAKE_NB_JOBS); + printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS); #else // Let make use parent -j argument or -j1 if there // is no make parent #endif #elif NO_PARALLEL_MAKE==1 - printf("MAKE += -j 1\n"); + printf("MAKEFLAGS += -j 1\n"); #else - printf("MAKE += -j %d\n", get_num_cores()); + printf("MAKEFLAGS += -j %d\n", get_num_cores()); #endif break; From 397108fba299c87ce17957452d57469af914f516 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 31 Mar 2023 09:22:40 -0400 Subject: [PATCH 015/718] serialize shared prerequisites --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 144b3400d..3c4b8948a 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test .PHONY : all libs netlib $(RELA) test ctest shared install -.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test +.NOTPARALLEL : shared -all :: libs netlib $(RELA) tests shared +all :: tests @echo @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" @echo @@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif -tests : libs netlib $(RELA) shared +tests : shared ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS From 3e8f51e7cf49903216e8f92ff6c9cdf7bcf4886f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Apr 2023 22:25:07 +0200 Subject: [PATCH 016/718] Update version to 0.3.23.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 502bf7a9d..35fd830ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 23) +set(OpenBLAS_PATCH_VERSION 23.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 516f22b8ca61ff9976afa7cf07c79ebca9b94fa8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Apr 2023 22:25:55 +0200 Subject: [PATCH 017/718] Update version to 0.3.23.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index ab46fd075..e210e49e8 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.23 +VERSION = 0.3.23.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 65b7bf9f3e3437e8887630e477c2abd749a7f0ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Apr 2023 10:51:38 +0200 Subject: [PATCH 018/718] Add Apple M1 testing via Cirrus CI --- cirrus.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 cirrus.yml diff --git a/cirrus.yml b/cirrus.yml new file mode 100644 index 000000000..d16eb811a --- /dev/null +++ b/cirrus.yml @@ -0,0 +1,10 @@ +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + +task: + compile_script: + - brew install llvm + - export PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - make TARGET=VORTEX USE_OPENMP=1 CC=clang From 5f1fb27c40baa7b2ae105baf933cb6a86670aac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Apr 2023 11:00:17 +0200 Subject: [PATCH 019/718] Rename cirrus.yml to .cirrus.yml --- cirrus.yml => .cirrus.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cirrus.yml => .cirrus.yml (100%) diff --git a/cirrus.yml b/.cirrus.yml similarity index 100% rename from cirrus.yml rename to .cirrus.yml From d175b8f56fc770d888f515b1a79417ec6ca9ff85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 Apr 2023 15:02:10 +0200 Subject: [PATCH 020/718] Refactor ?GEBAL (Reference-LAPACK PR 808) --- lapack-netlib/SRC/cgebal.f | 312 ++++++++++++++++++++----------------- lapack-netlib/SRC/dgebal.f | 300 ++++++++++++++++++----------------- lapack-netlib/SRC/sgebal.f | 307 +++++++++++++++++++----------------- lapack-netlib/SRC/zgebal.f | 309 +++++++++++++++++++----------------- 4 files changed, 654 insertions(+), 574 deletions(-) diff --git a/lapack-netlib/SRC/cgebal.f b/lapack-netlib/SRC/cgebal.f index 5d1ebb026..3f54d3937 100644 --- a/lapack-netlib/SRC/cgebal.f +++ b/lapack-netlib/SRC/cgebal.f @@ -85,6 +85,7 @@ *> \verbatim *> ILO is INTEGER *> \endverbatim +*> *> \param[out] IHI *> \verbatim *> IHI is INTEGER @@ -154,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -183,8 +187,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -195,10 +199,10 @@ EXTERNAL SISNAN, LSAME, ICAMAX, SLAMCH, SCNRM2 * .. * .. External Subroutines .. - EXTERNAL CSSCAL, CSWAP, XERBLA + EXTERNAL XERBLA, CSSCAL, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, AIMAG, MAX, MIN, REAL + INTRINSIC ABS, REAL, AIMAG, MAX, MIN * * Test the input parameters * @@ -216,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL CSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL CSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( REAL( A( J, I ) ).NE.ZERO .OR. AIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL CSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL CSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR. + $ AIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL CSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL CSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( REAL( A( I, J ) ).NE.ZERO .OR. AIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = SCNRM2( L-K+1, A( K, I ), 1 ) - R = SCNRM2( L-K+1, A( I , K ), LDA ) - ICA = ICAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ICAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'CGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) - CALL CSSCAL( L, F, A( 1, I ), 1 ) + C = SCNRM2( L-K+1, A( K, I ), 1 ) + R = SCNRM2( L-K+1, A( I, K ), LDA ) + ICA = ICAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ICAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'CGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL CSSCAL( N-K+1, G, A( I, K ), LDA ) + CALL CSSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/dgebal.f b/lapack-netlib/SRC/dgebal.f index 821c7704a..f7b38b378 100644 --- a/lapack-netlib/SRC/dgebal.f +++ b/lapack-netlib/SRC/dgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -214,177 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL DSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL DSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL DSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL DSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL DSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL DSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 * - 140 CONTINUE - NOCONV = .FALSE. + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - DO 200 I = K, L + DO I = K, L * - C = DNRM2( L-K+1, A( K, I ), 1 ) - R = DNRM2( L-K+1, A( I, K ), LDA ) - ICA = IDAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IDAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) + C = DNRM2( L-K+1, A( K, I ), 1 ) + R = DNRM2( L-K+1, A( I, K ), LDA ) + ICA = IDAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IDAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * -* Guard against zero C or R due to underflow. +* Guard against zero C or R due to underflow. * - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE * * Exit if NaN to avoid infinite loop * - INFO = -3 - CALL XERBLA( 'DGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. -* - CALL DSCAL( N-K+1, G, A( I, K ), LDA ) - CALL DSCAL( L, F, A( 1, I ), 1 ) -* - 200 CONTINUE -* - IF( NOCONV ) - $ GO TO 140 + IF( DISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'DGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL DSCAL( N-K+1, G, A( I, K ), LDA ) + CALL DSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/sgebal.f b/lapack-netlib/SRC/sgebal.f index f519c8c57..7c115fb6c 100644 --- a/lapack-netlib/SRC/sgebal.f +++ b/lapack-netlib/SRC/sgebal.f @@ -153,6 +153,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -181,8 +184,8 @@ PARAMETER ( FACTOR = 0.95E+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L REAL C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -197,7 +200,7 @@ * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN -* +* .. * Test the input parameters * INFO = 0 @@ -214,176 +217,192 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL SSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL SSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 +* Permutation to isolate eigenvalues if possible. * - 50 CONTINUE - DO 70 J = L, 1, -1 -* - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( A( J, I ).NE.ZERO ) - $ GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL SSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL SSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL SSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( A( I, J ).NE.ZERO ) - $ GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = SNRM2( L-K+1, A( K, I ), 1 ) - R = SNRM2( L-K+1, A( I, K ), LDA ) - ICA = ISAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = ISAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - IF( SISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'SGEBAL', -INFO ) - RETURN - END IF - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L +* + C = SNRM2( L-K+1, A( K, I ), 1 ) + R = SNRM2( L-K+1, A( I, K ), LDA ) + ICA = ISAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = ISAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - CALL SSCAL( N-K+1, G, A( I, K ), LDA ) - CALL SSCAL( L, F, A( 1, I ), 1 ) +* Guard against zero C or R due to underflow. * - 200 CONTINUE + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( SISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'SGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL SSCAL( N-K+1, G, A( I, K ), LDA ) + CALL SSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * diff --git a/lapack-netlib/SRC/zgebal.f b/lapack-netlib/SRC/zgebal.f index d4a9e39f1..a467991d4 100644 --- a/lapack-netlib/SRC/zgebal.f +++ b/lapack-netlib/SRC/zgebal.f @@ -89,7 +89,7 @@ *> \param[out] IHI *> \verbatim *> IHI is INTEGER -*> ILO and IHI are set to INTEGER such that on exit +*> ILO and IHI are set to integers such that on exit *> A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N. *> If JOB = 'N' or 'S', ILO = 1 and IHI = N. *> \endverbatim @@ -155,6 +155,9 @@ *> *> Modified by Tzu-Yi Chen, Computer Science Division, University of *> California at Berkeley, USA +*> +*> Refactored by Evert Provoost, Department of Computer Science, +*> KU Leuven, Belgium *> \endverbatim *> * ===================================================================== @@ -184,8 +187,8 @@ PARAMETER ( FACTOR = 0.95D+0 ) * .. * .. Local Scalars .. - LOGICAL NOCONV - INTEGER I, ICA, IEXC, IRA, J, K, L, M + LOGICAL NOCONV, CANSWAP + INTEGER I, ICA, IRA, J, K, L DOUBLE PRECISION C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1, $ SFMIN2 * .. @@ -217,176 +220,194 @@ RETURN END IF * - K = 1 - L = N +* Quick returns. * - IF( N.EQ.0 ) - $ GO TO 210 + IF( N.EQ.0 ) THEN + ILO = 1 + IHI = 0 + RETURN + END IF * IF( LSAME( JOB, 'N' ) ) THEN - DO 10 I = 1, N + DO I = 1, N SCALE( I ) = ONE - 10 CONTINUE - GO TO 210 + END DO + ILO = 1 + IHI = N + RETURN END IF * - IF( LSAME( JOB, 'S' ) ) - $ GO TO 120 -* -* Permutation to isolate eigenvalues if possible -* - GO TO 50 -* -* Row and column exchange. -* - 20 CONTINUE - SCALE( M ) = J - IF( J.EQ.M ) - $ GO TO 30 -* - CALL ZSWAP( L, A( 1, J ), 1, A( 1, M ), 1 ) - CALL ZSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA ) -* - 30 CONTINUE - GO TO ( 40, 80 )IEXC -* -* Search for rows isolating an eigenvalue and push them down. -* - 40 CONTINUE - IF( L.EQ.1 ) - $ GO TO 210 - L = L - 1 -* - 50 CONTINUE - DO 70 J = L, 1, -1 +* Permutation to isolate eigenvalues if possible. * - DO 60 I = 1, L - IF( I.EQ.J ) - $ GO TO 60 - IF( DBLE( A( J, I ) ).NE.ZERO .OR. DIMAG( A( J, I ) ).NE. - $ ZERO )GO TO 70 - 60 CONTINUE -* - M = L - IEXC = 1 - GO TO 20 - 70 CONTINUE -* - GO TO 90 + K = 1 + L = N * -* Search for columns isolating an eigenvalue and push them left. + IF( .NOT.LSAME( JOB, 'S' ) ) THEN * - 80 CONTINUE - K = K + 1 +* Row and column exchange. * - 90 CONTINUE - DO 110 J = K, L + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for rows isolating an eigenvalue and push them down. +* + NOCONV = .FALSE. + DO I = L, 1, -1 + CANSWAP = .TRUE. + DO J = 1, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( L ) = I + IF( I.NE.L ) THEN + CALL ZSWAP( L, A( 1, I ), 1, A( 1, L ), 1 ) + CALL ZSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA ) + END IF + NOCONV = .TRUE. +* + IF( L.EQ.1 ) THEN + ILO = 1 + IHI = 1 + RETURN + END IF +* + L = L - 1 + END IF + END DO +* + END DO + + NOCONV = .TRUE. + DO WHILE( NOCONV ) +* +* Search for columns isolating an eigenvalue and push them left. +* + NOCONV = .FALSE. + DO J = K, L + CANSWAP = .TRUE. + DO I = K, L + IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR. + $ DIMAG( A( I, J ) ).NE.ZERO ) ) THEN + CANSWAP = .FALSE. + EXIT + END IF + END DO +* + IF( CANSWAP ) THEN + SCALE( K ) = J + IF( J.NE.K ) THEN + CALL ZSWAP( L, A( 1, J ), 1, A( 1, K ), 1 ) + CALL ZSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA ) + END IF + NOCONV = .TRUE. +* + K = K + 1 + END IF + END DO +* + END DO * - DO 100 I = K, L - IF( I.EQ.J ) - $ GO TO 100 - IF( DBLE( A( I, J ) ).NE.ZERO .OR. DIMAG( A( I, J ) ).NE. - $ ZERO )GO TO 110 - 100 CONTINUE + END IF * - M = K - IEXC = 2 - GO TO 20 - 110 CONTINUE +* Initialize SCALE for non-permuted submatrix. * - 120 CONTINUE - DO 130 I = K, L + DO I = K, L SCALE( I ) = ONE - 130 CONTINUE + END DO * - IF( LSAME( JOB, 'P' ) ) - $ GO TO 210 +* If we only had to permute, we are done. +* + IF( LSAME( JOB, 'P' ) ) THEN + ILO = K + IHI = L + RETURN + END IF * * Balance the submatrix in rows K to L. * -* Iterative loop for norm reduction +* Iterative loop for norm reduction. * SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' ) SFMAX1 = ONE / SFMIN1 SFMIN2 = SFMIN1*SCLFAC SFMAX2 = ONE / SFMIN2 - 140 CONTINUE - NOCONV = .FALSE. -* - DO 200 I = K, L -* - C = DZNRM2( L-K+1, A( K, I ), 1 ) - R = DZNRM2( L-K+1, A( I, K ), LDA ) - ICA = IZAMAX( L, A( 1, I ), 1 ) - CA = ABS( A( ICA, I ) ) - IRA = IZAMAX( N-K+1, A( I, K ), LDA ) - RA = ABS( A( I, IRA+K-1 ) ) -* -* Guard against zero C or R due to underflow. -* - IF( C.EQ.ZERO .OR. R.EQ.ZERO ) - $ GO TO 200 - G = R / SCLFAC - F = ONE - S = C + R - 160 CONTINUE - IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR. - $ MIN( R, G, RA ).LE.SFMIN2 )GO TO 170 - IF( DISNAN( C+F+CA+R+G+RA ) ) THEN * -* Exit if NaN to avoid infinite loop + NOCONV = .TRUE. + DO WHILE( NOCONV ) + NOCONV = .FALSE. * - INFO = -3 - CALL XERBLA( 'ZGEBAL', -INFO ) - RETURN - END IF - F = F*SCLFAC - C = C*SCLFAC - CA = CA*SCLFAC - R = R / SCLFAC - G = G / SCLFAC - RA = RA / SCLFAC - GO TO 160 -* - 170 CONTINUE - G = C / SCLFAC - 180 CONTINUE - IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR. - $ MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190 - F = F / SCLFAC - C = C / SCLFAC - G = G / SCLFAC - CA = CA / SCLFAC - R = R*SCLFAC - RA = RA*SCLFAC - GO TO 180 -* -* Now balance. -* - 190 CONTINUE - IF( ( C+R ).GE.FACTOR*S ) - $ GO TO 200 - IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN - IF( F*SCALE( I ).LE.SFMIN1 ) - $ GO TO 200 - END IF - IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN - IF( SCALE( I ).GE.SFMAX1 / F ) - $ GO TO 200 - END IF - G = ONE / F - SCALE( I ) = SCALE( I )*F - NOCONV = .TRUE. + DO I = K, L * - CALL ZDSCAL( N-K+1, G, A( I, K ), LDA ) - CALL ZDSCAL( L, F, A( 1, I ), 1 ) + C = DZNRM2( L-K+1, A( K, I ), 1 ) + R = DZNRM2( L-K+1, A( I, K ), LDA ) + ICA = IZAMAX( L, A( 1, I ), 1 ) + CA = ABS( A( ICA, I ) ) + IRA = IZAMAX( N-K+1, A( I, K ), LDA ) + RA = ABS( A( I, IRA+K-1 ) ) * - 200 CONTINUE +* Guard against zero C or R due to underflow. +* + IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE +* +* Exit if NaN to avoid infinite loop * - IF( NOCONV ) - $ GO TO 140 + IF( DISNAN( C+CA+R+RA ) ) THEN + INFO = -3 + CALL XERBLA( 'ZGEBAL', -INFO ) + RETURN + END IF +* + G = R / SCLFAC + F = ONE + S = C + R +* + DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND. + $ MIN( R, G, RA ).GT.SFMIN2 ) + F = F*SCLFAC + C = C*SCLFAC + CA = CA*SCLFAC + R = R / SCLFAC + G = G / SCLFAC + RA = RA / SCLFAC + END DO +* + G = C / SCLFAC +* + DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND. + $ MIN( F, C, G, CA ).GT.SFMIN2 ) + F = F / SCLFAC + C = C / SCLFAC + G = G / SCLFAC + CA = CA / SCLFAC + R = R*SCLFAC + RA = RA*SCLFAC + END DO +* +* Now balance. +* + IF( ( C+R ).GE.FACTOR*S ) CYCLE + IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN + IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE + END IF + IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN + IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE + END IF + G = ONE / F + SCALE( I ) = SCALE( I )*F + NOCONV = .TRUE. +* + CALL ZDSCAL( N-K+1, G, A( I, K ), LDA ) + CALL ZDSCAL( L, F, A( 1, I ), 1 ) +* + END DO +* + END DO * - 210 CONTINUE ILO = K IHI = L * From a4ee1c84f020e7450dd6e27e5909d0493724cf7e Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 7 Apr 2023 12:50:36 +0100 Subject: [PATCH 021/718] Export `ssyconvf` symbol This was apparently missed in commit a836fe8ec when adding the LAPACK 3.7.0 symbols. We noticed when adding wrappers for 3.7.0 routines in SciPy. For more details, see https://github.com/rgommers/scipy/issues/143 --- exports/gensymbol | 1 + 1 file changed, 1 insertion(+) diff --git a/exports/gensymbol b/exports/gensymbol index f05de626f..5823c0b3b 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z # functions added for lapack-3.7.0 lapackobjs2s="$lapackobjs2s slarfy + ssyconvf strevc3 sgelqt sgelqt3 From d677214570b3aa024eb74f0da7e7995b5b7e9bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Apr 2023 14:11:16 +0200 Subject: [PATCH 022/718] Remove the badge for the dead drone.io service and add Cirrus CI in its place --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ce85e08e..a2eac07be 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) -Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/) +Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS) + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) From 3effdc15053a53acc36c421ee8df78cf78879fa9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Apr 2023 19:32:22 +0200 Subject: [PATCH 023/718] Protect CROSS_PATH against spurious addition of linebreaks from isolated dashes fix for #3989 --- c_check | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index e8f90e18a..7173968e5 100755 --- a/c_check +++ b/c_check @@ -35,9 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" fi -bn=`basename $compiler_name` +bn=`basename \"$compiler_name\"` + case "$bn" in - *-*) cross_suffix="$cross_suffix${bn%-*}-" + *-*) if [ "$bn" != '-']; then + cross_suffix="$cross_suffix${bn%-*}-" + fi esac compiler="" From fd20a2e8c6c4aa2f47cd1e7019fd9b51176d393e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Apr 2023 22:28:00 +0200 Subject: [PATCH 024/718] Convert CMAKE booleans to 0/1 values for gensymbol --- CMakeLists.txt | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35fd830ee..8ecd95a95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -398,15 +398,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") endif() + if (${BUILD_LAPACK_DEPRECATED}) + set (BLD 1) + else () + set (BLD 0) + endif() + if (${BUILD_BFLOAT16}) + set (BBF16 1) + else () + set (BBF16 0) + endif() + if (${BUILD_SINGLE}) + set (BS 1) + else () + set (BS 0) + endif() + if (${BUILD_DOUBLE}) + set (BD 1) + else () + set (BD 0) + endif() + if (${BUILD_COMPLEX}) + set (BC 1) + else () + set (BC 0) + endif() + if (${BUILD_COMPLEX16}) + set (BZ 1) + else () + set (BZ 0) + endif() if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) From d5fbec7c20e2bcc6c088cf62029ff5e9a879fcd9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Apr 2023 23:49:35 +0200 Subject: [PATCH 025/718] Export ?MIN/?MAX, ?AMIN/?AMAX, CDOT/ZDOT and ?GEMMT --- exports/gensymbol | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 5823c0b3b..7ce85fd7d 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -17,11 +17,11 @@ # removed blas_thread_shutdown_ # blasobjsc=" - caxpy caxpby ccopy cdotc cdotu cgbmv cgemm cgemv cgerc cgeru + camin camax caxpy caxpby ccopy cdot cdotc cdotu cgbmv cgemm cgemv cgerc cgeru chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm - ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum" + ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" blasobjsd=" damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm @@ -29,7 +29,7 @@ blasobjsd=" dscal dsdot dspmv dspr2 dimatcopy domatcopy dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv - idamax idamin idmax idmin dgeadd dsum" + idamax idamin idmax idmin dgeadd dsum dgemmt" blasobjss=" isamax isamin ismax ismin @@ -38,58 +38,58 @@ blasobjss=" smax smin snrm2 simatcopy somatcopy srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv - strmm strmv strsm strsv sgeadd ssum" + strmm strmv strsm strsv sgeadd ssum sgemmt" blasobjsz=" - izamax izamin - zaxpy zaxpby zcopy zdotc zdotu zdrot + zamin zamax izamax izamin + zaxpy zaxpby zcopy zdot zdotc zdotu zdrot zdscal zgbmv zgemm zgemv zgerc zgeru zhbmv zhemm zhemv zher2 zher2k zher zherk zhpmv zhpr2 zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 - zgeadd dzsum" + zgeadd dzsum zgemmt" blasobjs="lsame xerbla" bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" cblasobjsc=" - cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv + cblas_caxpy cblas_ccopy cblas_cdot cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum + cblas_scnrm2 cblas_scasum cblas_camin cblas_camax cblas_cgemmt cblas_cmin cblas_cmax cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy " cblasobjsd=" - cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot - cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 + cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot cblas_damin cblas_damax + cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 cblas_dmin cblas_dmax cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv - cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd + cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy " cblasobjss=" - cblas_sasum cblas_saxpy cblas_saxpby + cblas_sasum cblas_saxpy cblas_saxpby cblas_samin cblas_samax cblas_smax cblas_smin cblas_scopy cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm cblas_sgemv cblas_sger cblas_snrm2 cblas_srot cblas_srotg cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy " cblasobjsz=" - cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdotc cblas_zdotu cblas_zdscal + cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdot cblas_zdotc cblas_zdotu cblas_zdscal cblas_zgbmv cblas_zgemm cblas_zgemv cblas_zgerc cblas_zgeru cblas_zhbmv cblas_zhemm cblas_zhemv cblas_zher2 cblas_zher2k cblas_zher cblas_zherk cblas_zhpmv cblas_zhpr2 cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd + cblas_zaxpby cblas_zgeadd cblas_zamin cblas_zamax cblas_zgemmt cblas_zmin cblas_zmax cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy " From caa2945138f3c8a6f3f0dacbaf653c283e3cd2cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Apr 2023 00:04:09 +0200 Subject: [PATCH 026/718] Support Apple A15/M2 cpus through the existing VORTEX target --- cpuid_arm64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1080ea974..809f48e95 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -268,7 +268,8 @@ int detect(void) #else #ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1 + if (value == 3660830781) return CPU_VORTEX; //A15/M2 #endif return CPU_ARMV8; #endif From 57bdc36c846cd44396e1d39a5f7a191bda363503 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Apr 2023 22:38:38 +0200 Subject: [PATCH 027/718] add conditionals for BUILD_LAPACK_DEPRECATED --- cmake/lapack.cmake | 4 ++++ cmake/lapacke.cmake | 30 ++++++------------------------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 45dda8686..544e226ab 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -436,6 +436,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) @@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.f) @@ -930,6 +932,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) @@ -943,6 +946,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.c) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 3a9352197..be6a286fe 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -70,8 +70,6 @@ set(CSRC lapacke_cgeqlf_work.c lapacke_cgeqp3.c lapacke_cgeqp3_work.c - lapacke_cgeqpf.c - lapacke_cgeqpf_work.c lapacke_cgeqr.c lapacke_cgeqr_work.c lapacke_cgeqr2.c @@ -144,12 +142,8 @@ set(CSRC lapacke_cggqrf_work.c lapacke_cggrqf.c lapacke_cggrqf_work.c - lapacke_cggsvd.c - lapacke_cggsvd_work.c lapacke_cggsvd3.c lapacke_cggsvd3_work.c - lapacke_cggsvp.c - lapacke_cggsvp_work.c lapacke_cggsvp3.c lapacke_cggsvp3_work.c lapacke_cgtcon.c @@ -695,8 +689,6 @@ set(DSRC lapacke_dgeqlf_work.c lapacke_dgeqp3.c lapacke_dgeqp3_work.c - lapacke_dgeqpf.c - lapacke_dgeqpf_work.c lapacke_dgeqr.c lapacke_dgeqr_work.c lapacke_dgeqr2.c @@ -771,12 +763,8 @@ set(DSRC lapacke_dggqrf_work.c lapacke_dggrqf.c lapacke_dggrqf_work.c - lapacke_dggsvd.c - lapacke_dggsvd_work.c lapacke_dggsvd3.c lapacke_dggsvd3_work.c - lapacke_dggsvp.c - lapacke_dggsvp_work.c lapacke_dggsvp3.c lapacke_dggsvp3_work.c lapacke_dgtcon.c @@ -1275,8 +1263,6 @@ set(SSRC lapacke_sgeqlf_work.c lapacke_sgeqp3.c lapacke_sgeqp3_work.c - lapacke_sgeqpf.c - lapacke_sgeqpf_work.c lapacke_sgeqr.c lapacke_sgeqr_work.c lapacke_sgeqr2.c @@ -1351,12 +1337,8 @@ set(SSRC lapacke_sggqrf_work.c lapacke_sggrqf.c lapacke_sggrqf_work.c - lapacke_sggsvd.c - lapacke_sggsvd_work.c lapacke_sggsvd3.c lapacke_sggsvd3_work.c - lapacke_sggsvp.c - lapacke_sggsvp_work.c lapacke_sggsvp3.c lapacke_sggsvp3_work.c lapacke_sgtcon.c @@ -1849,8 +1831,6 @@ set(ZSRC lapacke_zgeqlf_work.c lapacke_zgeqp3.c lapacke_zgeqp3_work.c - lapacke_zgeqpf.c - lapacke_zgeqpf_work.c lapacke_zgeqr.c lapacke_zgeqr_work.c lapacke_zgeqr2.c @@ -1925,12 +1905,8 @@ set(ZSRC lapacke_zggqrf_work.c lapacke_zggrqf.c lapacke_zggrqf_work.c - lapacke_zggsvd.c - lapacke_zggsvd_work.c lapacke_zggsvd3.c lapacke_zggsvd3_work.c - lapacke_zggsvp.c - lapacke_zggsvp_work.c lapacke_zggsvp3.c lapacke_zggsvp3_work.c lapacke_zgtcon.c @@ -2401,6 +2377,12 @@ set(ZSRC lapacke_csyr_work.c lapacke_ilaver.c ) +if (BUILD_LAPACK_DEPRECATED) +set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) +set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) +set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) +set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) +endif() set(SRCX lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c From cd8eb33a9c989b479367c2bdd33d7c843c27e3fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Apr 2023 22:39:53 +0200 Subject: [PATCH 028/718] Expose BUILD_LAPACK_DEPRECATED --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ecd95a95..d59290c90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) + option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) @@ -398,12 +400,12 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") endif() - if (${BUILD_LAPACK_DEPRECATED}) + if (${BUILD_LAPACK_DEPRECATED}) set (BLD 1) else () set (BLD 0) endif() - if (${BUILD_BFLOAT16}) + if (${BUILD_BFLOAT16}) set (BBF16 1) else () set (BBF16 0) From 6c45c980835cc0fc0d6fc8751349af54bdaa8426 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Apr 2023 22:41:18 +0200 Subject: [PATCH 029/718] Add (only) the GEMMT functions --- exports/gensymbol | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 7ce85fd7d..b584167a4 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -17,7 +17,7 @@ # removed blas_thread_shutdown_ # blasobjsc=" - camin camax caxpy caxpby ccopy cdot cdotc cdotu cgbmv cgemm cgemv cgerc cgeru + caxpy caxpby ccopy cdotc cdotu cgbmv cgemm cgemv cgerc cgeru chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm @@ -41,8 +41,8 @@ blasobjss=" strmm strmv strsm strsv sgeadd ssum sgemmt" blasobjsz=" - zamin zamax izamax izamin - zaxpy zaxpby zcopy zdot zdotc zdotu zdrot + izamax izamin + zaxpy zaxpby zcopy zdotc zdotu zdrot zdscal zgbmv zgemm zgemv zgerc zgeru zhbmv zhemm zhemv zher2 zher2k zher zherk zhpmv zhpr2 zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv @@ -53,17 +53,17 @@ blasobjsz=" blasobjs="lsame xerbla" bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" cblasobjsc=" - cblas_caxpy cblas_ccopy cblas_cdot cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv + cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum cblas_camin cblas_camax cblas_cgemmt cblas_cmin cblas_cmax + cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy " cblasobjsd=" - cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot cblas_damin cblas_damax - cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 cblas_dmin cblas_dmax + cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot + cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv @@ -72,24 +72,24 @@ cblasobjsd=" " cblasobjss=" - cblas_sasum cblas_saxpy cblas_saxpby cblas_samin cblas_samax cblas_smax cblas_smin + cblas_sasum cblas_saxpy cblas_saxpby cblas_scopy cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm cblas_sgemv cblas_sger cblas_snrm2 cblas_srot cblas_srotg cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd cblas_sgemmt + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy " cblasobjsz=" - cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdot cblas_zdotc cblas_zdotu cblas_zdscal + cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdotc cblas_zdotu cblas_zdscal cblas_zgbmv cblas_zgemm cblas_zgemv cblas_zgerc cblas_zgeru cblas_zhbmv cblas_zhemm cblas_zhemv cblas_zher2 cblas_zher2k cblas_zher cblas_zherk cblas_zhpmv cblas_zhpr2 cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd cblas_zamin cblas_zamax cblas_zgemmt cblas_zmin cblas_zmax + cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy " From 2ea00788c271c7a5727b13f7a90433ce21639042 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Apr 2023 22:46:51 +0200 Subject: [PATCH 030/718] Add ?GEMMT --- exports/gensymbol.pl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index e38a3cc89..dd79e924d 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 +38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum); + strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); @blasobjsz = ( izamax,izamin,, @@ -48,7 +48,7 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum); + zgeadd, dzsum, zgemmt); @blasobjs = (lsame, xerbla); @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @@ -60,7 +60,7 @@ cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy - ); + cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, @@ -69,7 +69,7 @@ cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy - ); + cblas_dgemmt); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -80,7 +80,7 @@ cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy - ); + cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -90,7 +90,7 @@ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy -); + cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); From ac650225c1cba69b3aba0d52031af838e3b6d1dc Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Thu, 13 Apr 2023 00:08:27 +0800 Subject: [PATCH 031/718] Fix x86 detection error --- cpuid_x86.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index ad13a8c8c..69cbba90e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1936,7 +1936,8 @@ static char *corename[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *corename_lower[] = { @@ -1970,7 +1971,8 @@ static char *corename_lower[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; @@ -2276,16 +2278,18 @@ int get_coretype(void){ return CORE_NEHALEM; } if (model == 15) { // Sapphire Rapids + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; if(support_avx512_bf16()) - return CPUTYPE_COOPERLAKE; + return CORE_COOPERLAKE; if(support_avx512()) - return CPUTYPE_SKYLAKEX; + return CORE_SKYLAKEX; if(support_avx2()) - return CPUTYPE_HASWELL; + return CORE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; + return CORE_NEHALEM; } break; From cda29633a30bf7ecbc64f85e4bcc6517ad954f1c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 17:59:48 +0200 Subject: [PATCH 032/718] move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/cgemm_kernel_8x4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 24e08a646..f100adc7a 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] From c7bbad09adf8cdd2fa4b8709ea669e530a0136a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:00:47 +0200 Subject: [PATCH 033/718] Move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 29a68ff22..2c63925be 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 +#define alphaI w19 #define alpha0_R s10 #define alphaV0_R v10.s[0] From 0b1acb0ba3aa327fee65bc6bcf596080dfc39f4b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:03:35 +0200 Subject: [PATCH 034/718] Move ALPHA_I out of register 18 (reserved on OSX) --- kernel/arm64/ctrmm_kernel_8x4.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 5c0827397..e8f1d8cf3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR w17 -#define alphaI w18 -#define temp x19 -#define tempOffset x20 -#define tempK x21 +#define alphaI w19 +#define temp x20 +#define tempOffset x21 +#define tempK x22 #define alpha0_R s10 #define alphaV0_R v10.s[0] From 108a21e47a754032a9fb5477afcb76c6c158a146 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:05:14 +0200 Subject: [PATCH 035/718] Move ALPHA out of register 18 (reserved on OSX) --- kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S index c969ed4db..60e1f347b 100644 --- a/kernel/arm64/sgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ #define lanes x15 #define pA1 x16 #define pA2 x17 -#define alpha w18 -#define vec_len x19 +#define alpha w19 +#define vec_len x20 #define vec_lenx2 x20 #define alpha0 s10 From 3727672a74c18938230c3a2db012a5693688bfd6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:07:52 +0200 Subject: [PATCH 036/718] Improve workaround and keep compilers from optimizing it out --- kernel/arm64/dznrm2_thunderx2t99.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index e342b0b63..0bd274b3f 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" - +#include #include #if defined(SMP) @@ -344,6 +344,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT dummy_alpha[2]; #endif FLOAT ssq, scale; + volatile FLOAT sca; if (n <= 0 || inc_x <= 0) return 0.0; @@ -404,7 +405,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif - if (fabs(scale) <1.e-300) return 0.; + sca = fabs(scale); + if (sca < DBL_MIN) return 0.; ssq = sqrt(ssq) * scale; return ssq; From f096a339e4a22f4bc6dc454640e5d4007b07368b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Apr 2023 18:16:09 +0200 Subject: [PATCH 037/718] Use long value fields for cpu ident on OSX --- cpuid_arm64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 809f48e95..e586f9a3c 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -267,9 +267,9 @@ int detect(void) } #else #ifdef __APPLE__ - sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1 - if (value == 3660830781) return CPU_VORTEX; //A15/M2 + sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); + if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 + if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 #endif return CPU_ARMV8; #endif From 970e611e007eeb180ce963af89746d9e84f90e8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Apr 2023 19:42:34 +0200 Subject: [PATCH 038/718] fix missing blank in test --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 7173968e5..9be152b12 100755 --- a/c_check +++ b/c_check @@ -38,7 +38,7 @@ fi bn=`basename \"$compiler_name\"` case "$bn" in - *-*) if [ "$bn" != '-']; then + *-*) if [ "$bn" != '-' ]; then cross_suffix="$cross_suffix${bn%-*}-" fi esac From 8be68fa7f4edfa0c65949faf67f8feea2c7f0f43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Apr 2023 12:02:39 +0200 Subject: [PATCH 039/718] move declaration of sca to really keep the compiler from throwing it out (for now) --- kernel/arm64/dznrm2_thunderx2t99.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index 0bd274b3f..6077c85dd 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -344,7 +344,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT dummy_alpha[2]; #endif FLOAT ssq, scale; - volatile FLOAT sca; if (n <= 0 || inc_x <= 0) return 0.0; @@ -405,7 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else nrm2_compute(n, x, inc_x, &ssq, &scale); #endif - sca = fabs(scale); + volatile FLOAT sca = fabs(scale); if (sca < DBL_MIN) return 0.; ssq = sqrt(ssq) * scale; From 38d7a7b562860555a6440953124eb47cf7d5b506 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Apr 2023 00:07:58 +0200 Subject: [PATCH 040/718] Fix ?GEMMT --- interface/gemmt.c | 100 +++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..d35406411 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,13 +65,13 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; @@ -92,7 +89,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -167,8 +163,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -184,7 +178,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint m, lda, ldb; FLOAT *a, *b; XFLOAT *buffer; @@ -248,9 +242,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -262,8 +253,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -273,8 +262,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -319,8 +306,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -407,37 +392,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if ((m == 0) ) return; IDEBUG_START; - FUNCTION_PROFILE_START(); - const blasint incb = (transb == 0) ? 1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; if (transa) { - l = k; aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; if (transa) { - l = k; aa = a + lda * i; - bb = b + i; } + if (transb) + bb = b + i; cc = c + i * ldc + i; #endif @@ -458,8 +441,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +460,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +496,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb) { bb = b + i; } cc = c + i * ldc; @@ -537,8 +530,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -558,30 +549,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; From bfc20c2e97c9695d1f04c6b5ad16d21fb6c1db76 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Mon, 17 Apr 2023 11:17:42 +0100 Subject: [PATCH 041/718] Add Chris Sidebottom to CONTRIBUTORS.md --- CONTRIBUTORS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f5e9dda91..71df13634 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,9 @@ * Optimization on AMD Piledriver * Optimization on Intel Haswell +* Chris Sidebottom + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia @@ -212,4 +215,4 @@ In chronological order: * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. * Pablo Romero - * [2022-08] Fix building from sources for QNX \ No newline at end of file + * [2022-08] Fix building from sources for QNX From 44164e3a3d7f5c956728596b9f88d43cad0a8c14 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Apr 2023 14:23:13 +0200 Subject: [PATCH 042/718] revert "move alpha out of register 18" (out of PR scope, no SVE on Apple hw) --- kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S index 60e1f347b..c969ed4db 100644 --- a/kernel/arm64/sgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ #define lanes x15 #define pA1 x16 #define pA2 x17 -#define alpha w19 -#define vec_len x20 +#define alpha w18 +#define vec_len x19 #define vec_lenx2 x20 #define alpha0 s10 From 32f2fafde75ca674401c1ce4bc4301ca271536fa Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 24 Nov 2022 13:38:20 +0000 Subject: [PATCH 043/718] Propagate SWITCH_RATIO to DYNAMIC_ARCH builds Previously dynamic builds were either using the default SWITCH_RATIO or one from the higher level architecture; this patch ensures the dynamic builds can use this parameter as well. --- common_param.h | 2 ++ driver/level3/level3_gemm3m_thread.c | 15 ++++++++------ driver/level3/level3_syrk_threaded.c | 13 +++++++----- driver/level3/level3_thread.c | 31 +++++++++++++++++----------- kernel/setparam-ref.c | 5 ++++- lapack/potrf/potrf_parallel.c | 4 ---- param.h | 4 ++++ 7 files changed, 46 insertions(+), 28 deletions(-) diff --git a/common_param.h b/common_param.h index e14ef2782..7230dd00d 100644 --- a/common_param.h +++ b/common_param.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -45,6 +46,7 @@ typedef struct { int dtb_entries; + int switch_ratio; int offsetA, offsetB, align; #ifdef BUILD_BFLOAT16 diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 39824fc5a..26d07fa94 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG divN, divT; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); @@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } */ - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) { GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO divT = nthreads; divN = 1; - while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) { do { divT --; divN = 1; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index d7dcd68a3..b03577fb3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - //The array of job_t may overflow the stack. //Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD @@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO int mode, mask; double dnum, di, dinum; - if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif + + if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 02b60b50d..c9ecf73e8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -44,10 +45,6 @@ #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef GEMM_PREFERED_SIZE #define GEMM_PREFERED_SIZE 1 #endif @@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get execution mode */ #ifndef COMPLEX @@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < SWITCH_RATIO) { - width = SWITCH_RATIO; + if (width < switch_ratio) { + width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); @@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads_m, nthreads_n; +#if defined(DYNAMIC_ARCH) + int switch_ratio = gotoblas->switch_ratio; +#else + int switch_ratio = SWITCH_RATIO; +#endif /* Get dimensions from index ranges if available */ if (range_m) { @@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF n = range_n[1] - range_n[0]; } - /* Partitions in m should have at least SWITCH_RATIO rows */ - if (m < 2 * SWITCH_RATIO) { + /* Partitions in m should have at least switch_ratio rows */ + if (m < 2 * switch_ratio) { nthreads_m = 1; } else { nthreads_m = args -> nthreads; - while (m < nthreads_m * SWITCH_RATIO) { + while (m < nthreads_m * switch_ratio) { nthreads_m = nthreads_m / 2; } } - /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ - if (n < SWITCH_RATIO * nthreads_m) { + /* Partitions in n should have at most switch_ratio * nthreads_m columns */ + if (n < switch_ratio * nthreads_m) { nthreads_n = 1; } else { - nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); + nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m); if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); } diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 522c6d7d9..79436f43b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -49,7 +50,9 @@ static void init_parameter(void); gotoblas_t TABLE_NAME = { - DTB_DEFAULT_ENTRIES , + DTB_DEFAULT_ENTRIES, + + SWITCH_RATIO, GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 29364cc05..a7c28f4c2 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -80,10 +80,6 @@ static FLOAT dm1 = -1.; #define DIVIDE_RATE 2 #endif -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - #ifndef LOWER #define TRANS #endif diff --git a/param.h b/param.h index 19cbe75a5..aec1b6a92 100644 --- a/param.h +++ b/param.h @@ -3838,6 +3838,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif From 5b165420b5962b2b73319f55b747be4f6c697860 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Mon, 5 Dec 2022 15:17:52 +0000 Subject: [PATCH 044/718] SWITCH_RATIO for Arm(R) Neoverse(TM) architecture This seems like a good balance of values for reasonably sized matrices. With `SWITCH_RATIO=16` the DGEMM scales better to bigger sizes but the better solution would be some kind of thread throttling so I've gone with `SWITCH_RATIO=8`. --- param.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index f1f5cbdad..ae391dd3f 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2023, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -3338,6 +3338,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN1) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3367,7 +3373,11 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) -#define SWITCH_RATIO 16 +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3398,6 +3408,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + #undef SBGEMM_ALIGN_K #define SBGEMM_ALIGN_K 4 From ec334e69dc1db8ad607ffedd7bcac53d29026ac8 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Mon, 17 Apr 2023 17:38:42 +0100 Subject: [PATCH 045/718] Use SVE kernel for SGEMM/DGEMM on Arm(R) Neoverse(TM) V1 This re-spins #3869 with some additional copy unrolling which helps maintain SYRK performance. After #3868, the SVE kernels represent a pretty good boost. This re-uses ARMV8SVE as a base and I'm going to incrementally move everything to use ARMV8SVE in additional patches (as well as fix up anything that's not already in ARMV8SVE). --- benchmark/syrk.c | 6 +- kernel/arm64/KERNEL.ARMV8SVE | 12 +-- kernel/arm64/KERNEL.NEOVERSEV1 | 167 +++++++---------------------- kernel/arm64/dgemm_ncopy_sve_v1.c | 79 -------------- kernel/arm64/dgemm_tcopy_sve_v1.c | 77 ------------- kernel/arm64/gemm_ncopy_sve_v1x8.c | 131 ++++++++++++++++++++++ kernel/arm64/gemm_tcopy_sve_v1x8.c | 125 +++++++++++++++++++++ kernel/arm64/sgemm_ncopy_sve_v1.c | 78 -------------- kernel/arm64/sgemm_tcopy_sve_v1.c | 77 ------------- param.h | 8 +- 10 files changed, 307 insertions(+), 453 deletions(-) delete mode 100644 kernel/arm64/dgemm_ncopy_sve_v1.c delete mode 100644 kernel/arm64/dgemm_tcopy_sve_v1.c create mode 100644 kernel/arm64/gemm_ncopy_sve_v1x8.c create mode 100644 kernel/arm64/gemm_tcopy_sve_v1x8.c delete mode 100644 kernel/arm64/sgemm_ncopy_sve_v1.c delete mode 100644 kernel/arm64/sgemm_tcopy_sve_v1.c diff --git a/benchmark/syrk.c b/benchmark/syrk.c index fa0f24666..e0ae58707 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2014, 2023 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -67,7 +67,7 @@ int main(int argc, char *argv[]){ int step = 1; int loops = 1; - if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); double time1,timeg; @@ -77,7 +77,7 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bd25f7cd8..07393624c 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -128,10 +128,10 @@ SGEMM_BETA = sgemm_beta.S SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S +SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -149,8 +149,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c +DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 9a5938459..d6617e8a4 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,75 +1,6 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +include $(KERNELDIR)/KERNEL.ARMV8SVE -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c @@ -100,67 +31,20 @@ DDOTKERNEL = dot.c SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(SGEMM_UNROLL_N), 16) -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CTRMMUNCOPY_M = +CTRMMLNCOPY_M = +CTRMMUTCOPY_M = +CTRMMLTCOPY_M = +CHEMMLTCOPY_M = +CHEMMUTCOPY_M = +CSYMMUCOPY_M = +CSYMMLCOPY_M = CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S @@ -169,12 +53,34 @@ CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = endif CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMCOPYLN_M = +ZTRSMCOPYLT_M = +ZTRSMCOPYUN_M = +ZTRSMCOPYUT_M = + +ZTRMMUNCOPY_M = +ZTRMMLNCOPY_M = +ZTRMMUTCOPY_M = +ZTRMMLTCOPY_M = +ZHEMMLTCOPY_M = +ZHEMMUTCOPY_M = +ZSYMMUCOPY_M = +ZSYMMLCOPY_M = + ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) @@ -182,6 +88,9 @@ ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = endif ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c deleted file mode 100644 index 1f812c775..000000000 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ /dev/null @@ -1,79 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - svint64_t lda_vec = svindex_s64(0LL, lda); - uint64_t sve_size = svcntd(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { - - aoffset1 = aoffset; - - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; - - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); - - - } while (svptest_any(svptrue_b64(), pg)); - - return 0; -} diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c deleted file mode 100644 index cb645a1b6..000000000 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ /dev/null @@ -1,77 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - uint64_t sve_size = svcntd(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { - - aoffset1 = aoffset; - - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1(pg, (double *)aoffset1); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; - - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); - - } while (svptest_any(svptrue_b64(), pg)); - - return 0; -} diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c new file mode 100644 index 000000000..113b1ee40 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#define SV_PREFETCH svprfd_gather_index +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#define SV_PREFETCH svprfw_gather_index +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner++; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda); + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE(0L, remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c new file mode 100644 index 000000000..68a2cc07c --- /dev/null +++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1(pg, a_offset_inner); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner += lda; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE(0L, remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c deleted file mode 100644 index 1bc186335..000000000 --- a/kernel/arm64/sgemm_ncopy_sve_v1.c +++ /dev/null @@ -1,78 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - svint32_t lda_vec = svindex_s32(0LL, lda); - uint32_t sve_size = svcntw(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b32(j, n); - uint32_t active = svcntp_b32(svptrue_b32(), pg); - do { - - aoffset1 = aoffset; - - uint32_t i_cnt = m; - while (i_cnt--) { - svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); - svst1_f32(pg, (float *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; - - j += svcntw(); - pg = svwhilelt_b32(j, n); - active = svcntp_b32(svptrue_b32(), pg); - - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c deleted file mode 100644 index 9f8cf502a..000000000 --- a/kernel/arm64/sgemm_tcopy_sve_v1.c +++ /dev/null @@ -1,77 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#include - -// TODO: write in assembly with proper unrolling of inner loop -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; - - uint32_t sve_size = svcntw(); - - aoffset = a; - boffset = b; - - j = 0; - svbool_t pg = svwhilelt_b32(j, n); - uint32_t active = svcntp_b32(svptrue_b32(), pg); - do { - - aoffset1 = aoffset; - - uint32_t i_cnt = m; - while (i_cnt--) { - svfloat32_t a_vec = svld1(pg, (float *) aoffset1); - svst1_f32(pg, (float *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; - - j += svcntw(); - pg = svwhilelt_b32(j, n); - active = svcntp_b32(svptrue_b32(), pg); - - } while (svptest_any(svptrue_b32(), pg)); - - return 0; -} diff --git a/param.h b/param.h index f1f5cbdad..99ff7cb4c 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2014, The OpenBLAS Project +Copyright (c) 2011-2023, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -3370,10 +3370,10 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SWITCH_RATIO 16 #define SGEMM_DEFAULT_UNROLL_M 16 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate +#define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 From 479509bb37d5cd26baa62462abe461a0a1d43bb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Apr 2023 21:57:25 +0200 Subject: [PATCH 046/718] Remove any stray trailing dash from CROSS_SUFFIX (as would result from clang -arch) --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index 9be152b12..232adba67 100755 --- a/c_check +++ b/c_check @@ -40,6 +40,7 @@ bn=`basename \"$compiler_name\"` case "$bn" in *-*) if [ "$bn" != '-' ]; then cross_suffix="$cross_suffix${bn%-*}-" + cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'` fi esac From 7de9335c56aed6fb7fb1590c4d146338ca666726 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:42:09 +0200 Subject: [PATCH 047/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/sgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index c9681fa8b..296eded5a 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From 99f6d31ed52822ec69bba1f225ef889e99d38f99 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:42:55 +0200 Subject: [PATCH 048/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/sgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 07aa51503..ea89a2aaf 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From d18efaed20e4ed48bba3777a091fa4e49e35b67f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:43:43 +0200 Subject: [PATCH 049/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/ssymv_L.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 45914daf5..4826b00c6 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From bb6d6735bf094b8f1bf6fb7a986f5360c0baf2c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:44:15 +0200 Subject: [PATCH 050/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/ssymv_U.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 26e5ca7e9..06db14ebe 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From 66b39b835c33b1f9300d042010d255f1465a8e3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:44:45 +0200 Subject: [PATCH 051/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 27397ccfa..72a712a9e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From c2fe9cb91fab22ff00a3a660aa106cfe6b9e132f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:45:14 +0200 Subject: [PATCH 052/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 8fc960610..678cea957 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From c9174ae8d7e385c9fd030d263f6ff5e07aa9b2ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Apr 2023 23:45:44 +0200 Subject: [PATCH 053/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/zgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 63c8b11a4..44d545df7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#if (defined(__GNUC__) && __GNUC__ > 11) #pragma GCC optimize("no-tree-vectorize") #endif From 84bcf6639fb2de9ae15350a6b3979bf5757bae85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Apr 2023 23:24:52 +0200 Subject: [PATCH 054/718] Disable gcc's tree-vectorizer pass on all operating systems --- kernel/x86_64/cgemv_n_4.c | 5 +++-- kernel/x86_64/cgemv_t_4.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 3ca173c20..94e6d2c77 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -24,9 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif -#include -#include #include "common.h" #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 3187e196c..f123e81d1 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -24,7 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#if (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #include "common.h" From 67d33e5b9878a42e0b61151a6bd40ac728fbe19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Apr 2023 13:02:23 +0200 Subject: [PATCH 055/718] CI (MSYS2): Update location of compiler cache. --- .github/workflows/dynamic_arch.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index ff40b354d..fdfad3365 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -217,11 +217,16 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Prepare ccache + # Get cache location of ccache + id: ccache-prepare + run: | + echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT + - name: Compilation cache uses: actions/cache@v3 with: - # It looks like this path needs to be hard-coded. - path: C:/msys64/home/runneradmin/.ccache + path: ${{ steps.ccache-prepare.outputs.ccachedir }} # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} @@ -234,9 +239,10 @@ jobs: # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. run: | which ccache - test -d ~/.ccache || mkdir -p ~/.ccache - echo "max_size = 250M" > ~/.ccache/ccache.conf - echo "compression = true" >> ~/.ccache/ccache.conf + test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }} + echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf + ccache -p ccache -s echo $HOME cygpath -w $HOME From e27e9a50b1d831d2b02a7d1c7477fab5ab9f460d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Apr 2023 14:10:40 +0200 Subject: [PATCH 056/718] CI (MSYS2): Save ccache before running tests. --- .github/workflows/dynamic_arch.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index fdfad3365..b537c8494 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -219,17 +219,19 @@ jobs: - name: Prepare ccache # Get cache location of ccache + # Create key that is used in action/cache/restore and action/cache/save steps id: ccache-prepare run: | echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT - - name: Compilation cache - uses: actions/cache@v3 + - name: Restore ccache + uses: actions/cache/restore@v3 with: path: ${{ steps.ccache-prepare.outputs.ccachedir }} - # We include the commit sha in the cache key, as new cache entries are - # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} + key: ${{ steps.ccache-prepare.outputs.key }} # Restore a matching ccache cache entry. Prefer same branch. restore-keys: | ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} @@ -270,6 +272,13 @@ jobs: continue-on-error: true run: ccache -s + - name: Save ccache + # Save the cache after we are done (successfully) building + uses: actions/cache/save@v3 + with: + path: ${{ steps.ccache-prepare.outputs.ccachedir }} + key: ${{ steps.ccache-prepare.outputs.key }} + - name: Run tests timeout-minutes: 60 run: cd build && ctest From f2c6a34de9836ff9c2b3684f5ce19e08098f373e Mon Sep 17 00:00:00 2001 From: lilinjie Date: Mon, 24 Apr 2023 10:29:59 +0800 Subject: [PATCH 057/718] fix typo Signed-off-by: lilinjie --- GotoBLAS_06WeirdPerformance.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt index 05766e17b..0f7cec5c9 100644 --- a/GotoBLAS_06WeirdPerformance.txt +++ b/GotoBLAS_06WeirdPerformance.txt @@ -11,7 +11,7 @@ operation is finished. -2. Simlar problem may happen under virtual machine. If supervisor +2. Similar problem may happen under virtual machine. If supervisor allocates different cores for each scheduling, BLAS performnace will be bad. This is because BLAS also utilizes all cache, unexpected re-schedule for different core may result of heavy From 7152d6b06d549e7f1b474c0eedcf0c8bba5d9421 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Apr 2023 08:36:20 +0200 Subject: [PATCH 058/718] fix cblas_gemmt --- interface/gemmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index d35406411..0ec754fb9 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -177,7 +177,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, blasint k, #ifndef COMPLEX FLOAT alpha, @@ -392,7 +392,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) ) + if (m == 0) return; IDEBUG_START; From 1f6f7328ebcada538eb30894e46d89f188e3a217 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Apr 2023 09:14:12 +0200 Subject: [PATCH 059/718] remove redundant declaration --- interface/gemmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index 0ec754fb9..cebc7918d 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, lda, ldb; + blasint lda, ldb; FLOAT *a, *b; XFLOAT *buffer; From 7b16c4c051bfafcf8f926daec6bdbe48983308cf Mon Sep 17 00:00:00 2001 From: Han Gao Date: Fri, 28 Apr 2023 04:32:06 +0000 Subject: [PATCH 060/718] CI (C910V): add test Signed-off-by: Han Gao --- .github/workflows/c910v.yml | 121 ++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 .github/workflows/c910v.yml diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml new file mode 100644 index 000000000..199304fb1 --- /dev/null +++ b/.github/workflows/c910v.yml @@ -0,0 +1,121 @@ +name: c910v qemu test + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + runs-on: ubuntu-latest + env: + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 + toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz + strategy: + fail-fast: false + matrix: + include: + - target: RISCV64_GENERIC + triple: riscv64-linux-gnu + apt_triple: riscv64-linux-gnu + opts: NO_SHARED=1 TARGET=RISCV64_GENERIC + - target: C910V + triple: riscv64-unknown-linux-gnu + apt_triple: riscv64-linux-gnu + opts: NO_SHARED=1 TARGET=C910V + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: T-head-Semi/qemu + path: qemu + ref: 1e692ebb43d396c52352406323fc782c1ac99a42 + + - name: build qemu + run: | + # Force use c910v qemu-user + wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + cd qemu + patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: | + wget ${xuetie_toolchain}/${toolchain_file_name} + tar -xvf ${toolchain_file_name} -C /opt + export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" + + make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-riscv64 ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat From b1781ad338842d4e91d8c851986e8e1a78986fe8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Korinsky" Date: Sat, 29 Apr 2023 11:13:26 +0200 Subject: [PATCH 061/718] Do not requires GNU mktemp Historically the GNU mktemp was the first one which doesn't requires `-t` to create a directory. Here I've introduced a fallback when `-t` is required. For example MacPorts contains similar patch: https://github.com/macports/macports-ports/blob/bbe8abfe26651cc46d496bf48ecda506faa40a4a/math/OpenBLAS/files/patch-MacOSX-mktemp.diff --- c_check | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/c_check b/c_check index 232adba67..8b633ab8b 100755 --- a/c_check +++ b/c_check @@ -168,7 +168,7 @@ fi no_msa=0 if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then - tmpd="$(mktemp -d)" + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"addvi.b $w0, $w1, 1"' msa_flags='-mmsa -mfp64 -mload-store-pairs' @@ -208,7 +208,7 @@ esac no_avx512=0 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"vbroadcastss -4 * 4(%rsi), %zmm2"' printf "#include \n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" @@ -229,7 +229,7 @@ fi no_rv64gv=0 if [ "$architecture" = "riscv64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" code='"vsetvli zero, zero, e8, m1\n"' printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf" @@ -245,7 +245,7 @@ fi no_sve=0 if [ "$architecture" = "arm64" ]; then - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" printf "#include \n\n int main(void){}\n">> "$tmpf" args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf" @@ -261,7 +261,7 @@ fi c11_atomics=0 case "$data" in *HAVE_C11*) - tmpd=`mktemp -d` + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" printf "#include \nint main(void){}\n" >> "$tmpf" args=" -c -o $tmpf.o $tmpf" From e5538a62cb71c50623e117888a85ba26cd11dc9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 May 2023 22:56:39 +0200 Subject: [PATCH 062/718] Add suggestions to NUM_THREADS/auxiliary buffer message --- driver/others/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 4493b7d71..3cbd17bc2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3015,6 +3015,8 @@ void *blas_memory_alloc(int procpos){ #endif if (memory_overflowed) goto terminate; fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); + fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); + fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); From 60226b35e1e24a64be69c7f65934286df2dcfedb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 May 2023 12:44:38 +0200 Subject: [PATCH 063/718] Fix (redundant) lapack-runtest target --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3c4b8948a..299970c67 100644 --- a/Makefile +++ b/Makefile @@ -373,10 +373,10 @@ ifneq ($(CROSS), 1) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) endif -lapack-runtest: +lapack-runtest: lapack-test ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING ) blas-test: From c2078b2356fdad2badfd385cae50ea8499c2c6a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 May 2023 20:15:13 +0200 Subject: [PATCH 064/718] Mark xerbla's arguments as const --- cblas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cblas.h b/cblas.h index c2bdd27fa..8a5055cf8 100644 --- a/cblas.h +++ b/cblas.h @@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc); -void cblas_xerbla(blasint p, char *rout, char *form, ...); +void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...); /*** BLAS extensions ***/ From c74ee11376600ecf546240a78c6bf6a9728b91e0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 May 2023 14:24:38 +0200 Subject: [PATCH 065/718] Add an M1-based OSX crossbuild and a NeoverseN1 build to CIRRUS CI (#3997) * Add an M1-based OSX crossbuild and a NeoverseN1 build (plus Windows//LLVM commented out for now) --- .cirrus.yml | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index d16eb811a..b8f03acaf 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -2,9 +2,65 @@ macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest task: + name: AppleM1/LLVM compile_script: - brew install llvm - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - make TARGET=VORTEX USE_OPENMP=1 CC=clang + +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4 +task: + name: AppleM1/LLVM x86_64 xbuild + compile_script: + - #brew install llvm + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - export ARCHS="i386 x86_64" + - export ARCHS_STANDARD="i386 x86_64" + - export ARCHS_STANDARD_32_64_BIT="i386 x86_64" + - export ARCHS_STANDARD_64_BIT=x86_64 + - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64" + - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64" + - export VALID_ARCHS="i386 x86_64" + - #find /Applications/Xcode-13.4.1.app -name libunwind.dylib + - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator15.5.sdk -arch x86_64 -miphoneos-version-min=10.0" + - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + always: + config_artifacts: + path: "*conf*" + type: text/plain +# lib_artifacts: +# path: "libopenblas*" +# type: application/octet-streamm + + +task: + name: NeoverseN1 + arm_container: + image: node:latest + compile_script: + - make + + +#task: +# name: Windows/LLVM16 --- too slow --- +# windows_container: +# image: cirrusci/windowsservercore:cmake-2021.12.07 +# install_script: +# - choco list --localonly +# - choco install -y llvm +# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"' +# - choco install -y ninja +# - refreshenv +# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build" +# - vcvarsall x64 +# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build" +# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release +# - cd build +# - cmake --build . +# - ctest From 33ab415f68e90fbe78874e4b18fee185e998fb96 Mon Sep 17 00:00:00 2001 From: Ken Ho Date: Mon, 8 May 2023 14:43:56 -0700 Subject: [PATCH 066/718] Bug fix and improvements for [z]imatcopy interface. --- interface/imatcopy.c | 34 ++++++++--------- interface/zimatcopy.c | 89 +++++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 66 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f4..495243949 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + #ifdef NEW_IMATCOPY - if ( *lda == *ldb && *rows == *cols) { + if ( *lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + return; } } else @@ -138,21 +141,18 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + return; } - else + else if ( *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + return; } } - return; } - #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) @@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); + OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb ); } else { - OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb ); } } else { if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb ); } else { - OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb ); + OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb ); } } diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4e..fcad398ba 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + #ifdef NEW_IMATCOPY - if (*lda == *ldb && *cols == *rows) { + if (*lda == *ldb ) { if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } else { @@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } if ( trans == BlasConj ) { IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTrans ) + if ( trans == BlasTrans && *rows == *cols ) { IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } - if ( trans == BlasTransConj ) + if ( trans == BlasTransConj && *rows == *cols ) { IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + return; } + } - return; } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; - - b = malloc(msize); - if ( b == NULL ) - { - printf("Memory alloc failed in zimatcopy\n"); - exit(1); - } + msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; + b = malloc(msize); + if ( b == NULL ) + { + printf("Memory alloc failed\n"); + exit(1); + } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { - OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } if ( trans == BlasConj ) { - OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } if ( trans == BlasTrans ) { - OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } if ( trans == BlasTransConj ) { - OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } } @@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( trans == BlasNoTrans ) { - OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } if ( trans == BlasConj ) { - OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); + OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } if ( trans == BlasTrans ) { - OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } if ( trans == BlasTransConj ) { - OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); - OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); - free(b); - return; + OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); + OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } } + free(b); return; From 7a86c437b52c0664889769b663f3937cf45a2d94 Mon Sep 17 00:00:00 2001 From: Ken Ho Date: Wed, 10 May 2023 09:13:04 -0700 Subject: [PATCH 067/718] Change some "if" statements to "else if" following suggestion by @mmuetzel. --- interface/zimatcopy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index fcad398ba..34454c582 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -198,17 +198,17 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); @@ -223,17 +223,17 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasConj ) + else if ( trans == BlasConj ) { OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb ); } - if ( trans == BlasTrans ) + else if ( trans == BlasTrans ) { OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); } - if ( trans == BlasTransConj ) + else if ( trans == BlasTransConj ) { OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows ); OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb ); From df1b1f6a91461c5e0e7b3564693e4474d744bf19 Mon Sep 17 00:00:00 2001 From: Ken Ho Date: Fri, 12 May 2023 09:41:52 -0700 Subject: [PATCH 068/718] More detailed error message in [z]imatcopy.c. --- interface/imatcopy.c | 2 +- interface/zimatcopy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 495243949..c4417e99c 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -157,7 +157,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, b = malloc(msize); if ( b == NULL ) { - printf("Memory alloc failed\n"); + printf("Memory alloc failed in imatcopy\n"); exit(1); } diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 34454c582..b0b32dc87 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -186,7 +186,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, b = malloc(msize); if ( b == NULL ) { - printf("Memory alloc failed\n"); + printf("Memory alloc failed in zimatcopy\n"); exit(1); } From e2779c852f0630b6ff6a2eec9e8cfbb014e300f7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 15 May 2023 20:49:56 +0200 Subject: [PATCH 069/718] Do not build the tests when only the CBLAS interface is selected (#4041) * Do not build the tests when only the CBLAS interface is selected --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d59290c90..cc964b76e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,19 +311,25 @@ endif() #if (MSVC OR NOT NOFORTRAN) if (NOT NO_CBLAS) + if (NOT ONLY_CBLAS) # Broken without fortran on unix - add_subdirectory(utest) + add_subdirectory(utest) +endif() endif() if (NOT NOFORTRAN) + if (NOT ONLY_CBLAS) # Build test and ctest add_subdirectory(test) + endif() if (BUILD_TESTING) add_subdirectory(lapack-netlib/TESTING) endif() endif() if(NOT NO_CBLAS) + if (NOT ONLY_CBLAS) add_subdirectory(ctest) + endif() endif() if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) add_subdirectory(cpp_thread_test) From 86f48997c749d54a7c13193c500247c5610bb9f6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 May 2023 12:01:50 +0200 Subject: [PATCH 070/718] CirrusCI: Add Neoverse build with OpenMP (#4042) * Add Neoverse build with OpenMP --- .cirrus.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index b8f03acaf..7ca1cc7bc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -46,7 +46,14 @@ task: compile_script: - make - +task: + name: NeoverseN1-OMP + arm_container: + image: node:latest + cpu: 8 + compile_script: + - make USE_OPENMP=1 + #task: # name: Windows/LLVM16 --- too slow --- # windows_container: From c5f7e46526a602689f1b19e0598a411fb0a33111 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 May 2023 19:54:42 +0200 Subject: [PATCH 071/718] Fix typos and errors in comments (Reference-LAPACK 809) --- lapack-netlib/cgbsvx.f | 644 ++++++++++++ lapack-netlib/cgejsv.f | 2232 +++++++++++++++++++++++++++++++++++++++ lapack-netlib/cgesvx.f | 602 +++++++++++ lapack-netlib/dgbsvx.f | 639 ++++++++++++ lapack-netlib/dgejsv.f | 1780 ++++++++++++++++++++++++++++++++ lapack-netlib/dgesvx.f | 599 +++++++++++ lapack-netlib/sgbsvx.f | 641 ++++++++++++ lapack-netlib/sgesvx.f | 599 +++++++++++ lapack-netlib/zgbsvx.f | 644 ++++++++++++ lapack-netlib/zgejsv.f | 2234 ++++++++++++++++++++++++++++++++++++++++ lapack-netlib/zgesvx.f | 602 +++++++++++ 11 files changed, 11216 insertions(+) create mode 100644 lapack-netlib/cgbsvx.f create mode 100644 lapack-netlib/cgejsv.f create mode 100644 lapack-netlib/cgesvx.f create mode 100644 lapack-netlib/dgbsvx.f create mode 100644 lapack-netlib/dgejsv.f create mode 100644 lapack-netlib/dgesvx.f create mode 100644 lapack-netlib/sgbsvx.f create mode 100644 lapack-netlib/sgesvx.f create mode 100644 lapack-netlib/zgbsvx.f create mode 100644 lapack-netlib/zgejsv.f create mode 100644 lapack-netlib/zgesvx.f diff --git a/lapack-netlib/cgbsvx.f b/lapack-netlib/cgbsvx.f new file mode 100644 index 000000000..eaab5682c --- /dev/null +++ b/lapack-netlib/cgbsvx.f @@ -0,0 +1,644 @@ +*> \brief CGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by CGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by CGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGBsolve +* +* ===================================================================== + SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL CLANGB, CLANTB, SLAMCH + EXTERNAL LSAME, CLANGB, CLANTB, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGBCON, CGBEQU, CGBRFS, CGBTRF, CGBTRS, + $ CLACPY, CLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL CCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL CGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = CLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = CLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGBSVX +* + END diff --git a/lapack-netlib/cgejsv.f b/lapack-netlib/cgejsv.f new file mode 100644 index 000000000..51a6cee4e --- /dev/null +++ b/lapack-netlib/cgejsv.f @@ -0,0 +1,2232 @@ +*> \brief \b CGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* REAL SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=B*D. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are not well determined by the data +*> and are considered as noisy; the matrix is treated as +*> numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^* restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations, if JOBT = 'N'. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use CGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> =========================== +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use CGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^* seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^* is taken as input. If A is +*> replaced with A^*, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> The option 'T' can be used to compute only the singular values, or +*> the full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> In general, this option is considered experimental, and 'N'; should +*> be preferred. This is subject to changes in the future. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is REAL array, dimension (N) +*> On exit, +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX array, dimension ( LDU, N ) or ( LDU, M ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^*. In that case, [V] is computed +*> in U as left singular vectors of A^* and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^*. In that case, [U] is computed +*> in V as right singular vectors of A^* and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX array, dimension (MAX(2,LWORK)) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the required length of +*> CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of CWORK to confirm proper allocation of workspace. +*> LWORK depends on the job: +*> +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): +*> LWORK >= 2*N+1. This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= N + (N+1)*NB. Here NB is the optimal +*> block size for CGEQP3 and CGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ)). +*> 1.2. .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). In this case, LWORK the minimal +*> requirement is LWORK >= N*N + 2*N. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ), +*> N*N+LWORK(CPOCON)). +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 2.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, +*> CUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ), +*> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). +*> 3. If SIGMA and the left singular vectors are needed +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON), +*> 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). +*> +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ. +*> +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using CPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: CGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> CGEJSV implements a preconditioned Jacobi SVD algorithm. It uses CGEQP3, +*> CGEQRF, and CGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by CGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (CGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / SLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (CGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: CGEQP3) should be +*> implemented as in [3]. We have a new version of CGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in CGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of CGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) + REAL SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) ) +* .. +* .. Local Scalars .. + COMPLEX CTEMP + REAL AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_CGELQF, LWRK_CGEQP3, LWRK_CGEQP3N, LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CGESVJV, LWRK_CGESVJU, LWRK_CUNMLQ, + $ LWRK_CUNMQR, LWRK_CUNMQRM +* .. +* .. Local Arrays + COMPLEX CDUMMY(1) + REAL RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, ALOG, MAX, MIN, REAL, NINT, SQRT +* .. +* .. External Functions .. + REAL SLAMCH, SCNRM2 + INTEGER ISAMAX, ICAMAX + LOGICAL LSAME + EXTERNAL ISAMAX, ICAMAX, LSAME, SLAMCH, SCNRM2 +* .. +* .. External Subroutines .. + EXTERNAL SLASSQ, CCOPY, CGELQF, CGEQP3, CGEQRF, CLACPY, CLAPMR, + $ CLASCL, SLASCL, CLASET, CLASSQ, CLASWP, CUNGQR, CUNMLQ, + $ CUNMQR, CPOCON, SSCAL, CSSCAL, CSWAP, CTRSM, CLACGV, + $ XERBLA +* + EXTERNAL CGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for CGEQP3 of an M x N matrix, +* CGEQRF of an N x N matrix, CGELQF of an N x N matrix, +* CUNMLQ for computing N x N matrix, CUNMQR for computing N x N +* matrix, CUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for CPOCON of an N x N matrix + LWCON = 2 * N +* .. minimal workspace length for CGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) + CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGEQRF = INT( CDUMMY(1) ) + CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_CGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, + $ N+LWRK_CGEQRF, LWRK_CGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, + $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, + $ N+LWRK_CGESVJ, N+LWRK_CUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVJ,N+LWRK_CGELQF, + $ 2*N+LWRK_CGEQRF, N+LWRK_CGESVJ, + $ N+LWRK_CUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_CGEQP3, N+LWRK_CGEQRF, + $ LWRK_CGESVJ, LWRK_CUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. JRACC ) THEN + CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJ = INT( CDUMMY(1) ) + CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJU = INT( CDUMMY(1) ) + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, + $ 2*N+LWRK_CGEQP3N, + $ 2*N+N**2+N+LWRK_CGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_CGESVJ, + $ 2*N+N**2+N+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ 2*N+N**2+N+LWRK_CUNMLQ, + $ N+N**2+LWRK_CGESVJU, + $ N+LWRK_CUNMQRM ) + END IF + ELSE + CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_CGESVJV = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_CUNMQR = INT( CDUMMY(1) ) + CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, + $ 2*N+LWRK_CGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR,N+LWRK_CUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_CGEQP3, 2*N+LWRK_CGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_CGESVJV, + $ 2*N+N**2+N+LWRK_CUNMQR, + $ N+LWRK_CUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( OPTWRK, MINWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'CGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure SLAMCH() does not fail on the target architecture. +* + EPSLN = SLAMCH('Epsilon') + SFMIN = SLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = SLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(REAL(M)*REAL(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL CLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'CGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL SSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL CLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL CLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL CLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL CLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL CGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL CUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL CCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL CLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* CLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,ICAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL SLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * ALOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / ALOG(REAL(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * ALOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / ALOG(REAL(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that CGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then CGESVJ will compute them. So, in that case, +* one should use CGESVJ instead of CGEJSV. + BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / REAL(N) ) +* >> for future updates: allow bigger range, i.e. the largest column +* will be allowed up to BIG/N and CGESVJ will do the rest. However, for +* this all other (LAPACK) components must allow such a range. +* TEMP1 = BIG/REAL(N) +* TEMP1 = BIG * EPSLN this should 'almost' work with current LAPACK components + CALL SLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL CLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN). + XSC = SQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using CGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL CLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN + IWOFF = 2*N + ELSE + IWOFF = N + END IF + DO 1952 p = 1, M - 1 + q = ISAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1 + IWORK(IWOFF+p) = q + IF ( p .NE. q ) THEN + TEMP1 = RWORK(M+p) + RWORK(M+p) = RWORK(M+q) + RWORK(M+q) = TEMP1 + END IF + 1952 CONTINUE + CALL CLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use CGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in xGEQPX from TOMS # 782). Good results will be obtained using +* xGEQPX with properly (!) chosen numerical parameters. +* Any improvement of CGEQP3 improves overall performance of CGEJSV. +* +* A * P1 = Q1 * [ R1^* 0]^*: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N, + $ RWORK, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then CGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = SQRT(REAL(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - REAL(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL CLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL CSSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL CPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL CLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL CSSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL CSSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL CPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL CPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL CCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 4947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL CGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL CCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL CLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / REAL(N) + DO 1947 q = 1, NR + CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL CGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL CGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL CLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL CGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL CLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL CGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL CLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL CGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL CLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL CUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL CCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL CLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL CLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL CGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL CCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL CLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL CGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL CLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of CGEJSV. +* + DO 1968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL CLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = SCNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL CSSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. REAL(N) +* more conservative <=> CONDR1 .LT. SQRT(REAL(N)) +* + COND_OK = SQRT(SQRT(REAL(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL CCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL CLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to CGEQP3 +* should be replaced with eg. CALL CGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^* * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL CGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1), + $ CWORK(2*N+1), LWORK-2*N, RWORK, IERR ) +** CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) +* V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) ) + V(p,q) = - CTEMP + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL CLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL CGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1), + $ CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL CLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = SCNRM2( p, CWORK(2*N+N*NR+NR+p), NR ) + CALL CSSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL CPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR ) + CONDR2 = ONE / SQRT(TEMP1) +* +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 4968 q = 2, NR + CTEMP = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) ) + V(p,q) = - CTEMP + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL CGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU, + $ CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK, + $ LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3970 p = 1, NR + CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL CSSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in CGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL CTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV) + ELSE +* .. R1 is well conditioned, but non-square. Adjoint of R2 +* is inverted to get the product of the Jacobi rotations +* used in CGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL CTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3870 p = 1, NR + CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL CSSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL CTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N, + $ U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that CGEJSV completes the task. +* Compute the full SVD of L3 using CGESVJ with explicit +* accumulation of Jacobi rotations. + CALL CGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL CUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N, + $ CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(REAL(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / SCNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL CLASET('A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = SQRT(REAL(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL CLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL CGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL CCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL CSSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL CTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL CCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(REAL(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / SCNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL CLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL CLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL CLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(REAL(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / SCNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. +* + DO 7968 p = 1, NR + CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL CLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL CLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL CCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL CLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = CMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL CGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(REAL(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / SCNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL CSSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL CLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL CLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL CLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL CSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF CGEJSV +* .. + END +* diff --git a/lapack-netlib/cgesvx.f b/lapack-netlib/cgesvx.f new file mode 100644 index 000000000..74a37e9a0 --- /dev/null +++ b/lapack-netlib/cgesvx.f @@ -0,0 +1,602 @@ +*> \brief CGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* REAL BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by CGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by CGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (MAX(1,2*N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsolve +* +* ===================================================================== + SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL CLANGE, CLANTR, SLAMCH + EXTERNAL LSAME, CLANGE, CLANTR, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL CGECON, CGEEQU, CGERFS, CGETRF, CGETRS, CLACPY, + $ CLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL CGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL CLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL CLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL CGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = CLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, INFO, A, LDA, RWORK ) / + $ RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = CLANGE( NORM, N, N, A, LDA, RWORK ) + RPVGRW = CLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = CLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL CGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL CGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL CGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of CGESVX +* + END diff --git a/lapack-netlib/dgbsvx.f b/lapack-netlib/dgbsvx.f new file mode 100644 index 000000000..0ee5eecb3 --- /dev/null +++ b/lapack-netlib/dgbsvx.f @@ -0,0 +1,639 @@ +*> \brief DGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is DOUBLE PRECISION array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is DOUBLE PRECISION array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by DGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by DGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,3*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGBsolve +* +* ===================================================================== + SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + DOUBLE PRECISION AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, DLANGB, DLANTB + EXTERNAL LSAME, DLAMCH, DLANGB, DLANTB +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGBCON, DGBEQU, DGBRFS, DGBTRF, DGBTRS, + $ DLACPY, DLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL DCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL DGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = DLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = DLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of DGBSVX +* + END diff --git a/lapack-netlib/dgejsv.f b/lapack-netlib/dgejsv.f new file mode 100644 index 000000000..ee769bb38 --- /dev/null +++ b/lapack-netlib/dgejsv.f @@ -0,0 +1,1780 @@ +*> \brief \b DGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* WORK, LWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), +* $ WORK( LWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEJSV computes the singular value decomposition (SVD) of a real M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^t, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and +*> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> DGEJSV can sometimes compute tiny singular values and their singular vectors much +*> more accurately than other SVD routines, see below under Further Details. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=D*B. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are the noise and the matrix is treated +*> as numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^t restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations. This option is +*> allowed only if JOBU .NE. 'N', i.e. in computing the full SVD. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around DSQRT(BIG), BIG=SLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use DGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [DSQRT(SFMIN), DSQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use DGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^t seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. This is subject to +*> changes in the future. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^t * A. See the descriptions of WORK(6) and WORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^t is taken as input. If A is +*> replaced with A^t, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> This option can be used to compute only the singular values, or the +*> full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is DOUBLE PRECISION array, dimension (N) +*> On exit, +*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the +*> computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For WORK(1) .NE. WORK(2): The singular values of A are +*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^t. In that case, [V] is computed +*> in U as left singular vectors of A^t and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^t. In that case, [U] is computed +*> in V as right singular vectors of A^t and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK)) +*> On exit, if N > 0 .AND. M > 0 (else not referenced), +*> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such +*> that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> WORK(2) = See the description of WORK(1). +*> WORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). +*> It is computed using DPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> WORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> WORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> +*> WORK(6) = the entropy of A^t*A :: this is the Shannon entropy +*> of diag(A^t*A) / Trace(A^t*A) taken as point in the +*> probability simplex. +*> WORK(7) = the entropy of A*A^t. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of WORK to confirm proper allocation of work space. +*> LWORK depends on the job: +*> +*> If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and +*> -> .. no scaled condition estimate required (JOBE = 'N'): +*> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal +*> block size for DGEQP3 and DGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7). +*> -> .. an estimate of the scaled condition number of A is +*> required (JOBA='E', 'G'). In this case, LWORK is the maximum +*> of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7). +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7). +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), +*> N+N*N+LWORK(DPOCON),7). +*> +*> If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). +*> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF, +*> DORMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON), +*> N+LWORK(DGELQF), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)). +*> +*> If SIGMA and the left singular vectors are needed +*> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), +*> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), +*> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON), +*> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)). +*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or +*> M*NB (for JOBU = 'F'). +*> +*> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> -> if JOBV = 'V' +*> the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N). +*> -> if JOBV = 'J' the minimal requirement is +*> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). +*> -> For optimal performance, LWORK should be additionally +*> larger than N+M*NB, where NB is the optimal block size +*> for DORMQR. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: DGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> DGEJSV implements a preconditioned Jacobi SVD algorithm. It uses DGEQP3, +*> DGEQRF, and DGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by DGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (DGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (DGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: DGEQP3) should be +*> implemented as in [3]. We have a new version of DGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in DGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of DGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributors: +* ================== +*> +*> Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany) +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ WORK, LWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ), + $ WORK( LWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK, + $ CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, MAXPRJ, SCALEM, + $ SCONDA, SFMIN, SMALL, TEMP1, USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LSVEC, + $ L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, + $ NOSCAL, ROWPIV, RSVEC, TRANSP +* .. +* .. Intrinsic Functions .. + INTRINSIC DABS, DLOG, MAX, MIN, DBLE, IDNINT, DSIGN, DSQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DNRM2 + INTEGER IDAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, LSAME, DLAMCH, DNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGELQF, DGEQP3, DGEQRF, DLACPY, DLASCL, + $ DLASET, DLASSQ, DLASWP, DORGQR, DORMLQ, + $ DORMQR, DPOCON, DSCAL, DSWAP, DTRSM, XERBLA +* + EXTERNAL DGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ LSAME( JOBU, 'W' )) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ LSAME( JOBV, 'W' )) .OR. ( JRACC .AND. (.NOT.LSVEC) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( L2TRAN .OR. LSAME( JOBT, 'N' ) ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE IF ( (.NOT.(LSVEC .OR. RSVEC .OR. ERREST).AND. + & (LWORK .LT. MAX(7,4*N+1,2*M+N))) .OR. + & (.NOT.(LSVEC .OR. RSVEC) .AND. ERREST .AND. + & (LWORK .LT. MAX(7,4*N+N*N,2*M+N))) .OR. + & (LSVEC .AND. (.NOT.RSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (RSVEC .AND. (.NOT.LSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1))) + & .OR. + & (LSVEC .AND. RSVEC .AND. (.NOT.JRACC) .AND. + & (LWORK.LT.MAX(2*M+N,6*N+2*N*N))) + & .OR. (LSVEC .AND. RSVEC .AND. JRACC .AND. + & LWORK.LT.MAX(2*M+N,4*N+N*N,2*N+N*N+6))) + & THEN + INFO = - 17 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'DGEJSV', - INFO ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:3) = 0 + WORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. +* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / DSQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL DLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'DGEJSV', -INFO ) + RETURN + END IF + AAQQ = DSQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL DLASET( 'G', M, N1, ZERO, ONE, U, LDU ) + IF ( RSVEC ) CALL DLASET( 'G', N, N, ZERO, ONE, V, LDV ) + WORK(1) = ONE + WORK(2) = ONE + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL DLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL DLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL DGEQRF( M, N, U,LDU, WORK, WORK(N+1),LWORK-N,IERR ) + CALL DORGQR( M,N1,1, U,LDU,WORK,WORK(N+1),LWORK-N,IERR ) + CALL DCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = ONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + WORK(1) = ONE / SCALEM + WORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IF ( ERREST ) WORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = ONE + WORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + WORK(6) = ZERO + WORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. + L2TRAN = L2TRAN .AND. ( M .EQ. N ) +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^t and A^t * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* DLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + WORK(M+N+p) = XSC * SCALEM + WORK(N+p) = XSC * (SCALEM*DSQRT(TEMP1)) + AATMAX = MAX( AATMAX, WORK(N+p) ) + IF (WORK(N+p) .NE. ZERO) AATMIN = MIN(AATMIN,WORK(N+p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + WORK(M+N+p) = SCALEM*DABS( A(p,IDAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, WORK(M+N+p) ) + AATMIN = MIN( AATMIN, WORK(M+N+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^t would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex. +* It is derived from the diagonal of A^t * A. Do the same with the +* diagonal of A * A^t, compute the entropy of the corresponding +* probability distribution. Note that A * A^t and A^t * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = N+1, N+M + BIG1 = ( ( WORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^t. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^t is better than A, transpose A. +* + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + DO 1116 q = p + 1, N + TEMP1 = A(q,p) + A(q,p) = A(p,q) + A(p,q) = TEMP1 + 1116 CONTINUE + 1115 CONTINUE + DO 1117 p = 1, N + WORK(M+N+p) = SVA(p) + SVA(p) = WORK(N+p) + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than DSQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep +* DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then DGESVJ will compute them. So, in that case, +* one should use DGESVJ instead of DGEJSV. +* + BIG1 = DSQRT( BIG ) + TEMP1 = DSQRT( BIG / DBLE(N) ) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN). + XSC = DSQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. DSQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using DGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.DSQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL DLASET( 'A', M, 1, ZERO, ZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, WORK(M+N+p), 1 ) + p - 1 + IWORK(2*N+p) = q + IF ( p .NE. q ) THEN + TEMP1 = WORK(M+N+p) + WORK(M+N+p) = WORK(M+N+q) + WORK(M+N+q) = TEMP1 + END IF + 1952 CONTINUE + CALL DLASWP( N, A, LDA, 1, M-1, IWORK(2*N+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in SGEQPX from TOMS # 782). Good results will be obtained using +* SGEQPX with properly (!) chosen numerical parameters. +* Any improvement of DGEQP3 improves overall performance of DGEJSV. +* +* A * P1 = Q1 * [ R1^t 0]^t: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL DGEQP3( M,N,A,LDA, IWORK,WORK, WORK(N+1),LWORK-N, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then DGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( DABS(A(p,p)) .GE. (TEMP1*DABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = DSQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( DABS(A(p,p)) .LT. (EPSLN*DABS(A(p-1,p-1))) ) .OR. + $ ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = DSQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( DABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = DABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + CALL DPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL DLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL DPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ WORK(N+1), IWORK(2*N+M+1), IERR ) + ELSE + CALL DLACPY( 'U', N, N, A, LDA, WORK(N+1), N ) + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL DSCAL( p, ONE/TEMP1, WORK(N+(p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. + CALL DPOCON( 'U', N, WORK(N+1), N, ONE, TEMP1, + $ WORK(N+N*N+1), IWORK(2*N+M+1), IERR ) + END IF + SCONDA = ONE / DSQRT(TEMP1) +* SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( DABS( A(1,1)/A(NR,NR) ) .GT. DSQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL DCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1946 CONTINUE +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL DGEQRF( N,NR, A,LDA, WORK, WORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL DCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = DSQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + TEMP1 = XSC*DABS(A(q,q)) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) + $ A(p,q) = DSIGN( TEMP1, A(p,q) ) + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL DGESVJ( 'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA, + $ N, V, LDV, WORK, LWORK, INFO ) +* + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) +* +* + ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'L','U','N', N, NR, V,LDV, SVA, NR, A,LDA, + $ WORK, LWORK, INFO ) + SCALEM = WORK(1) + NUMRANK = IDNINT(WORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL DLASET( 'Lower', NR-1, NR-1, ZERO, ZERO, A(2,1), LDA ) + CALL DGELQF( NR, N, A, LDA, WORK, WORK(N+1), LWORK-N, IERR) + CALL DLACPY( 'Lower', NR, NR, A, LDA, V, LDV ) + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + CALL DGEQRF( NR, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + 8998 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) +* + CALL DGESVJ( 'Lower', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, WORK(N+1), LWORK, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR, NR, ZERO,ZERO, V(NR+1,1), LDV ) + CALL DLASET( 'A',NR, N-NR, ZERO,ZERO, V(1,NR+1), LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE, V(NR+1,NR+1), LDV ) + END IF +* + CALL DORMLQ( 'Left', 'Transpose', N, N, NR, A, LDA, WORK, + $ V, LDV, WORK(N+1), LWORK-N, IERR ) +* + END IF +* + DO 8991 p = 1, N + CALL DCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) + 8991 CONTINUE + CALL DLACPY( 'All', N, N, A, LDA, V, LDV ) +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + 1965 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGEQRF( N, NR, U, LDU, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL DCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + 1967 CONTINUE + CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) +* + CALL DGESVJ( 'Lower', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, WORK(N+1), LWORK-N, INFO ) + SCALEM = WORK(N+1) + NUMRANK = IDNINT(WORK(N+2)) +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR,ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1), LDU ) + CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1), LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + CALL DSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL DLACPY( 'All', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of DGEJSV. +* + DO 1968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 2969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL DLACPY( 'L', NR, NR, V, LDV, WORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DNRM2(NR-p+1,WORK(2*N+(p-1)*NR+p),1) + CALL DSCAL(NR-p+1,ONE/TEMP1,WORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1, + $ WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR) + CONDR1 = ONE / DSQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. DSQRT(DBLE(N)) +* + COND_OK = DSQRT(DBLE(NR)) +*[TP] COND_OK is a tuning parameter. + + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^t = Q2 * R2 + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + IF ( DABS(V(q,p)) .LE. TEMP1 ) + $ V(q,p) = DSIGN( TEMP1, V(q,p) ) + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL DCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + 1969 CONTINUE +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to DGEQP3 +* should be replaced with eg. CALL SGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^t * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL DGEQP3( N, NR, V, LDV, IWORK(N+1), WORK(N+1), + $ WORK(2*N+1), LWORK-2*N, IERR ) +** CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + IF ( DABS(V(q,p)) .LE. TEMP1 ) + $ V(q,p) = DSIGN( TEMP1, V(q,p) ) + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q))) + V(p,q) = - DSIGN( TEMP1, V(q,p) ) + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL DLASET( 'L',NR-1,NR-1,ZERO,ZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL DGELQF( NR, NR, V, LDV, WORK(2*N+N*NR+1), + $ WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL DLACPY( 'L',NR,NR,V,LDV,WORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = DNRM2( p, WORK(2*N+N*NR+NR+p), NR ) + CALL DSCAL( p, ONE/TEMP1, WORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL DPOCON( 'L',NR,WORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ WORK(2*N+N*NR+NR+NR*NR+1),IWORK(M+2*N+1),IERR ) + CONDR2 = ONE / DSQRT(TEMP1) +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 4968 q = 2, NR + TEMP1 = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - DSIGN( TEMP1, V(q,p) ) + V(p,q) = - DSIGN( TEMP1, V(p,q) ) + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL DGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, + $ LDU,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + DO 3970 p = 1, NR + CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL DSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in DGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL DTRSM( 'L','U','N','N', NR,NR,ONE, A,LDA, V,LDV ) + ELSE +* .. R1 is well conditioned, but non-square. Transpose(R2) +* is inverted to get the product of the Jacobi rotations +* used in DGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL DTRSM('L','U','T','N',NR,NR,ONE,WORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV) + CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV) + CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV) + END IF + CALL DORMQR('L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* :) .. the input matrix A is very likely a relative of +* the Kahan matrix :) +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + DO 3870 p = 1, NR + CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL DSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL DTRSM('L','U','N','N',NR,NR,ONE,WORK(2*N+1),N,U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = WORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that DGEJSV completes the task. +* Compute the full SVD of L3 using DGESVJ with explicit +* accumulation of Jacobi rotations. + CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO ) + SCALEM = WORK(2*N+N*NR+NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2)) + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL DORMLQ( 'L', 'T', NR, NR, NR, WORK(2*N+1), N, + $ WORK(2*N+N*NR+1), U, LDU, WORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = WORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = DSQRT(DBLE(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = WORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / DNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU) + CALL DLASET('A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = DSQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL DLACPY( 'Upper', N, N, A, LDA, WORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL) + DO 5970 p = 2, N + TEMP1 = XSC * WORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 + WORK(N+(q-1)*N+p)=-DSIGN(TEMP1,WORK(N+(p-1)*N+q)) + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL DLASET( 'Lower',N-1,N-1,ZERO,ZERO,WORK(N+2),N ) + END IF +* + CALL DGESVJ( 'Upper', 'U', 'N', N, N, WORK(N+1), N, SVA, + $ N, U, LDU, WORK(N+N*N+1), LWORK-N-N*N, INFO ) +* + SCALEM = WORK(N+N*N+1) + NUMRANK = IDNINT(WORK(N+N*N+2)) + DO 6970 p = 1, N + CALL DCOPY( N, WORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL DSCAL( N, SVA(p), WORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N, + $ ONE, A, LDA, WORK(N+1), N ) + DO 6972 p = 1, N + CALL DCOPY( N, WORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = DSQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL DLASET( 'A', M-N, N, ZERO, ZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL DLASET( 'A',N, N1-N, ZERO, ZERO, U(1,N+1),LDU ) + CALL DLASET( 'A',M-N,N1-N, ZERO, ONE,U(N+1,N+1),LDU ) + END IF + END IF + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) + TEMP1 = DSQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values. Since that is not always the case, ... +* + DO 7968 p = 1, NR + CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + TEMP1 = XSC*DABS( V(q,q) ) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) + $ V(p,q) = DSIGN( TEMP1, V(p,q) ) + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV ) + END IF + + CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL DLACPY( 'L', N, NR, V, LDV, WORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = DSQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + TEMP1 = XSC * MIN(DABS(U(p,p)),DABS(U(q,q))) + U(p,q) = - DSIGN( TEMP1, U(q,p) ) + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL DLASET('U', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU ) + END IF + + CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, WORK(2*N+N*NR+1), LWORK-2*N-N*NR, INFO ) + SCALEM = WORK(2*N+N*NR+1) + NUMRANK = IDNINT(WORK(2*N+N*NR+2)) + + IF ( NR .LT. N ) THEN + CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV ) + CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV ) + CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV ) + END IF + + CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1), + $ V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = DSQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = WORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL DSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1),LDU ) + CALL DLASET( 'A',M-NR,N1-NR, ZERO, ONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U, + $ LDU, WORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^t + DO 6974 p = 1, N + CALL DSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + WORK(1) = USCAL2 * SCALEM + WORK(2) = USCAL1 + IF ( ERREST ) WORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + WORK(4) = CONDR1 + WORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + WORK(6) = ENTRA + WORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING +* + RETURN +* .. +* .. END OF DGEJSV +* .. + END +* diff --git a/lapack-netlib/dgesvx.f b/lapack-netlib/dgesvx.f new file mode 100644 index 000000000..f787488dc --- /dev/null +++ b/lapack-netlib/dgesvx.f @@ -0,0 +1,599 @@ +*> \brief DGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* DOUBLE PRECISION A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is DOUBLE PRECISION array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by DGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by DGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,4*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsolve +* +* ===================================================================== + SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + DOUBLE PRECISION A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, DLANGE, DLANTR + EXTERNAL LSAME, DLAMCH, DLANGE, DLANTR +* .. +* .. External Subroutines .. + EXTERNAL DGECON, DGEEQU, DGERFS, DGETRF, DGETRS, DLACPY, + $ DLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL DGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL DLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL DLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL DGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = DLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = DLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = DLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = DLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL DGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL DGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL DGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* + WORK( 1 ) = RPVGRW +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 + RETURN +* +* End of DGESVX +* + END diff --git a/lapack-netlib/sgbsvx.f b/lapack-netlib/sgbsvx.f new file mode 100644 index 000000000..df3a721d9 --- /dev/null +++ b/lapack-netlib/sgbsvx.f @@ -0,0 +1,641 @@ +*> \brief SGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGBSVX uses the LU factorization to compute the solution to a real +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is REAL array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is REAL array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by SGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by SGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,3*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGBsolve +* +* ===================================================================== + SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + REAL AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGB, SLANTB + EXTERNAL LSAME, SLAMCH, SLANGB, SLANTB +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGBCON, SGBEQU, SGBRFS, SGBTRF, SGBTRS, + $ SLACPY, SLAQGB, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL SCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL SGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = SLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGB( NORM, N, KL, KU, AB, LDAB, WORK ) + RPVGRW = SLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGBSVX +* + END diff --git a/lapack-netlib/sgesvx.f b/lapack-netlib/sgesvx.f new file mode 100644 index 000000000..385e626cf --- /dev/null +++ b/lapack-netlib/sgesvx.f @@ -0,0 +1,599 @@ +*> \brief SGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* REAL RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ), IWORK( * ) +* REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ BERR( * ), C( * ), FERR( * ), R( * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGESVX uses the LU factorization to compute the solution to a real +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is REAL array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by SGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by SGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is REAL array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is REAL +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is REAL array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is REAL array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,4*N)) +*> On exit, WORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If WORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 WORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N) +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGEsolve +* +* ===================================================================== + SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, IWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + REAL RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ), IWORK( * ) + REAL A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ BERR( * ), C( * ), FERR( * ), R( * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + REAL AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SLAMCH, SLANGE, SLANTR + EXTERNAL LSAME, SLAMCH, SLANGE, SLANTR +* .. +* .. External Subroutines .. + EXTERNAL SGECON, SGEEQU, SGERFS, SGETRF, SGETRS, SLACPY, + $ SLAQGE, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = SLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL SGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL SLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL SLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL SGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = SLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW + END IF + WORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = SLANGE( NORM, N, N, A, LDA, WORK ) + RPVGRW = SLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = SLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL SGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL SGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL SGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, IWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + WORK( 1 ) = RPVGRW + RETURN +* +* End of SGESVX +* + END diff --git a/lapack-netlib/zgbsvx.f b/lapack-netlib/zgbsvx.f new file mode 100644 index 000000000..871564a81 --- /dev/null +++ b/lapack-netlib/zgbsvx.f @@ -0,0 +1,644 @@ +*> \brief ZGBSVX computes the solution to system of linear equations A * X = B for GB matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGBSVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, +* LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, +* RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGBSVX uses the LU factorization to compute the solution to a complex +*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B, +*> where A is a band matrix of order N with KL subdiagonals and KU +*> superdiagonals, and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed by this subroutine: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = L * U, +*> where L is a product of permutation and unit lower triangular +*> matrices with KL subdiagonals, and U is upper triangular with +*> KL+KU superdiagonals. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AFB and IPIV contain the factored form of +*> A. If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> AB, AFB, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AFB and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AFB and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations. +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] KL +*> \verbatim +*> KL is INTEGER +*> The number of subdiagonals within the band of A. KL >= 0. +*> \endverbatim +*> +*> \param[in] KU +*> \verbatim +*> KU is INTEGER +*> The number of superdiagonals within the band of A. KU >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] AB +*> \verbatim +*> AB is COMPLEX*16 array, dimension (LDAB,N) +*> On entry, the matrix A in band storage, in rows 1 to KL+KU+1. +*> The j-th column of A is stored in the j-th column of the +*> array AB as follows: +*> AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl) +*> +*> If FACT = 'F' and EQUED is not 'N', then A must have been +*> equilibrated by the scaling factors in R and/or C. AB is not +*> modified if FACT = 'F' or 'N', or if FACT = 'E' and +*> EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDAB +*> \verbatim +*> LDAB is INTEGER +*> The leading dimension of the array AB. LDAB >= KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] AFB +*> \verbatim +*> AFB is COMPLEX*16 array, dimension (LDAFB,N) +*> If FACT = 'F', then AFB is an input argument and on entry +*> contains details of the LU factorization of the band matrix +*> A, as computed by ZGBTRF. U is stored as an upper triangular +*> band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1, +*> and the multipliers used during the factorization are stored +*> in rows KL+KU+2 to 2*KL+KU+1. If EQUED .ne. 'N', then AFB is +*> the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AFB is an output argument and on exit +*> returns details of the LU factorization of A. +*> +*> If FACT = 'E', then AFB is an output argument and on exit +*> returns details of the LU factorization of the equilibrated +*> matrix A (see the description of AB for the form of the +*> equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAFB +*> \verbatim +*> LDAFB is INTEGER +*> The leading dimension of the array AFB. LDAFB >= 2*KL+KU+1. +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = L*U +*> as computed by ZGBTRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization +*> has been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GBsolve +* +* ===================================================================== + SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, + $ LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX, + $ RCOND, FERR, BERR, WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX*16 AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* Moved setting of INFO = N+1 so INFO does not subsequently get +* overwritten. Sven, 17 Mar 05. +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J, J1, J2 + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, ZLANGB, ZLANTB + EXTERNAL LSAME, DLAMCH, ZLANGB, ZLANTB +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZCOPY, ZGBCON, ZGBEQU, ZGBRFS, ZGBTRF, + $ ZGBTRS, ZLACPY, ZLAQGB +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( KL.LT.0 ) THEN + INFO = -4 + ELSE IF( KU.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDAB.LT.KL+KU+1 ) THEN + INFO = -8 + ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN + INFO = -10 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -12 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -13 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -14 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -16 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -18 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGBSVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL ZGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL ZLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND, + $ AMAX, EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of the band matrix A. +* + DO 70 J = 1, N + J1 = MAX( J-KU, 1 ) + J2 = MIN( J+KL, N ) + CALL ZCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1, + $ AFB( KL+KU+1-J+J1, J ), 1 ) + 70 CONTINUE +* + CALL ZGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + ANORM = ZERO + DO 90 J = 1, INFO + DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 ) + ANORM = MAX( ANORM, ABS( AB( I, J ) ) ) + 80 CONTINUE + 90 CONTINUE + RPVGRW = ZLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ), + $ AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ANORM / RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = ZLANGB( NORM, N, KL, KU, AB, LDAB, RWORK ) + RPVGRW = ZLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL ZGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND, + $ WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL ZGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX, + $ INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL ZGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV, + $ B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 120 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 140 J = 1, NRHS + DO 130 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 130 CONTINUE + 140 CONTINUE + DO 150 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 150 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of ZGBSVX +* + END diff --git a/lapack-netlib/zgejsv.f b/lapack-netlib/zgejsv.f new file mode 100644 index 000000000..5fe899e50 --- /dev/null +++ b/lapack-netlib/zgejsv.f @@ -0,0 +1,2234 @@ +*> \brief \b ZGEJSV +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGEJSV + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, +* M, N, A, LDA, SVA, U, LDU, V, LDV, +* CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* .. Scalar Arguments .. +* IMPLICIT NONE +* INTEGER INFO, LDA, LDU, LDV, LWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK ) +* DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) +* INTEGER IWORK( * ) +* CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGEJSV computes the singular value decomposition (SVD) of a complex M-by-N +*> matrix [A], where M >= N. The SVD of [A] is written as +*> +*> [A] = [U] * [SIGMA] * [V]^*, +*> +*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N +*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and +*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are +*> the singular values of [A]. The columns of [U] and [V] are the left and +*> the right singular vectors of [A], respectively. The matrices [U] and [V] +*> are computed and stored in the arrays U and V, respectively. The diagonal +*> of [SIGMA] is computed and stored in the array SVA. +*> \endverbatim +*> +*> Arguments: +*> ========== +*> +*> \param[in] JOBA +*> \verbatim +*> JOBA is CHARACTER*1 +*> Specifies the level of accuracy: +*> = 'C': This option works well (high relative accuracy) if A = B * D, +*> with well-conditioned B and arbitrary diagonal matrix D. +*> The accuracy cannot be spoiled by COLUMN scaling. The +*> accuracy of the computed output depends on the condition of +*> B, and the procedure aims at the best theoretical accuracy. +*> The relative error max_{i=1:N}|d sigma_i| / sigma_i is +*> bounded by f(M,N)*epsilon* cond(B), independent of D. +*> The input matrix is preprocessed with the QRF with column +*> pivoting. This initial preprocessing and preconditioning by +*> a rank revealing QR factorization is common for all values of +*> JOBA. Additional actions are specified as follows: +*> = 'E': Computation as with 'C' with an additional estimate of the +*> condition number of B. It provides a realistic error bound. +*> = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings +*> D1, D2, and well-conditioned matrix C, this option gives +*> higher accuracy than the 'C' option. If the structure of the +*> input matrix is not known, and relative accuracy is +*> desirable, then this option is advisable. The input matrix A +*> is preprocessed with QR factorization with FULL (row and +*> column) pivoting. +*> = 'G': Computation as with 'F' with an additional estimate of the +*> condition number of B, where A=B*D. If A has heavily weighted +*> rows, then using this condition number gives too pessimistic +*> error bound. +*> = 'A': Small singular values are not well determined by the data +*> and are considered as noisy; the matrix is treated as +*> numerically rank deficient. The error in the computed +*> singular values is bounded by f(m,n)*epsilon*||A||. +*> The computed SVD A = U * S * V^* restores A up to +*> f(m,n)*epsilon*||A||. +*> This gives the procedure the licence to discard (set to zero) +*> all singular values below N*epsilon*||A||. +*> = 'R': Similar as in 'A'. Rank revealing property of the initial +*> QR factorization is used do reveal (using triangular factor) +*> a gap sigma_{r+1} < epsilon * sigma_r in which case the +*> numerical RANK is declared to be r. The SVD is computed with +*> absolute error bounds, but more accurately than with 'A'. +*> \endverbatim +*> +*> \param[in] JOBU +*> \verbatim +*> JOBU is CHARACTER*1 +*> Specifies whether to compute the columns of U: +*> = 'U': N columns of U are returned in the array U. +*> = 'F': full set of M left sing. vectors is returned in the array U. +*> = 'W': U may be used as workspace of length M*N. See the description +*> of U. +*> = 'N': U is not computed. +*> \endverbatim +*> +*> \param[in] JOBV +*> \verbatim +*> JOBV is CHARACTER*1 +*> Specifies whether to compute the matrix V: +*> = 'V': N columns of V are returned in the array V; Jacobi rotations +*> are not explicitly accumulated. +*> = 'J': N columns of V are returned in the array V, but they are +*> computed as the product of Jacobi rotations, if JOBT = 'N'. +*> = 'W': V may be used as workspace of length N*N. See the description +*> of V. +*> = 'N': V is not computed. +*> \endverbatim +*> +*> \param[in] JOBR +*> \verbatim +*> JOBR is CHARACTER*1 +*> Specifies the RANGE for the singular values. Issues the licence to +*> set to zero small positive singular values if they are outside +*> specified range. If A .NE. 0 is scaled so that the largest singular +*> value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues +*> the licence to kill columns of A whose norm in c*A is less than +*> SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN, +*> where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E'). +*> = 'N': Do not kill small columns of c*A. This option assumes that +*> BLAS and QR factorizations and triangular solvers are +*> implemented to work in that range. If the condition of A +*> is greater than BIG, use ZGESVJ. +*> = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)] +*> (roughly, as described above). This option is recommended. +*> =========================== +*> For computing the singular values in the FULL range [SFMIN,BIG] +*> use ZGESVJ. +*> \endverbatim +*> +*> \param[in] JOBT +*> \verbatim +*> JOBT is CHARACTER*1 +*> If the matrix is square then the procedure may determine to use +*> transposed A if A^* seems to be better with respect to convergence. +*> If the matrix is not square, JOBT is ignored. +*> The decision is based on two values of entropy over the adjoint +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). +*> = 'T': transpose if entropy test indicates possibly faster +*> convergence of Jacobi process if A^* is taken as input. If A is +*> replaced with A^*, then the row pivoting is included automatically. +*> = 'N': do not speculate. +*> The option 'T' can be used to compute only the singular values, or +*> the full SVD (U, SIGMA and V). For only one set of singular vectors +*> (U or V), the caller should provide both U and V, as one of the +*> matrices is used as workspace if the matrix A is transposed. +*> The implementer can easily remove this constraint and make the +*> code more complicated. See the descriptions of U and V. +*> In general, this option is considered experimental, and 'N'; should +*> be preferred. This is subject to changes in the future. +*> \endverbatim +*> +*> \param[in] JOBP +*> \verbatim +*> JOBP is CHARACTER*1 +*> Issues the licence to introduce structured perturbations to drown +*> denormalized numbers. This licence should be active if the +*> denormals are poorly implemented, causing slow computation, +*> especially in cases of fast convergence (!). For details see [1,2]. +*> For the sake of simplicity, this perturbations are included only +*> when the full SVD or only the singular values are requested. The +*> implementer/user can easily add the perturbation for the cases of +*> computing one set of singular vectors. +*> = 'P': introduce perturbation +*> = 'N': do not perturb +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the input matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the input matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] SVA +*> \verbatim +*> SVA is DOUBLE PRECISION array, dimension (N) +*> On exit, +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the +*> iterated matrices in the array A. +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if +*> sigma_max(A) overflows or if small singular values have been +*> saved from underflow by scaling the input matrix A. +*> - If JOBR='R' then some of the singular values may be returned +*> as exact zeros obtained by "set to zero" because they are +*> below the numerical rank threshold or are denormalized numbers. +*> \endverbatim +*> +*> \param[out] U +*> \verbatim +*> U is COMPLEX*16 array, dimension ( LDU, N ) +*> If JOBU = 'U', then U contains on exit the M-by-N matrix of +*> the left singular vectors. +*> If JOBU = 'F', then U contains on exit the M-by-M matrix of +*> the left singular vectors, including an ONB +*> of the orthogonal complement of the Range(A). +*> If JOBU = 'W' .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N), +*> then U is used as workspace if the procedure +*> replaces A with A^*. In that case, [V] is computed +*> in U as left singular vectors of A^* and then +*> copied back to the V array. This 'W' option is just +*> a reminder to the caller that in this case U is +*> reserved as workspace of length N*N. +*> If JOBU = 'N' U is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDU +*> \verbatim +*> LDU is INTEGER +*> The leading dimension of the array U, LDU >= 1. +*> IF JOBU = 'U' or 'F' or 'W', then LDU >= M. +*> \endverbatim +*> +*> \param[out] V +*> \verbatim +*> V is COMPLEX*16 array, dimension ( LDV, N ) +*> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of +*> the right singular vectors; +*> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), +*> then V is used as workspace if the pprocedure +*> replaces A with A^*. In that case, [U] is computed +*> in V as right singular vectors of A^* and then +*> copied back to the U array. This 'W' option is just +*> a reminder to the caller that in this case V is +*> reserved as workspace of length N*N. +*> If JOBV = 'N' V is not referenced, unless JOBT='T'. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V, LDV >= 1. +*> If JOBV = 'V' or 'J' or 'W', then LDV >= N. +*> \endverbatim +*> +*> \param[out] CWORK +*> \verbatim +*> CWORK is COMPLEX*16 array, dimension (MAX(2,LWORK)) +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the required length of +*> CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> Length of CWORK to confirm proper allocation of workspace. +*> LWORK depends on the job: +*> +*> 1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and +*> 1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'): +*> LWORK >= 2*N+1. This is the minimal requirement. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= N + (N+1)*NB. Here NB is the optimal +*> block size for ZGEQP3 and ZGEQRF. +*> In general, optimal LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ)). +*> 1.2. .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). In this case, LWORK the minimal +*> requirement is LWORK >= N*N + 2*N. +*> ->> For optimal performance (blocked code) the optimal value +*> is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ), +*> N*N+LWORK(ZPOCON)). +*> 2. If SIGMA and the right singular vectors are needed (JOBV = 'V'), +*> (JOBU = 'N') +*> 2.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, +*> ZUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ), +*> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). +*> 2.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance, +*> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, +*> ZUNMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ), +*> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). +*> 3. If SIGMA and the left singular vectors are needed +*> 3.1 .. no scaled condition estimate requested (JOBE = 'N'): +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 3.2 .. an estimate of the scaled condition number of A is +*> required (JOBA='E', or 'G'). +*> -> the minimal requirement is LWORK >= 3*N. +*> -> For optimal performance: +*> if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR. +*> In general, the optimal length LWORK is computed as +*> LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON), +*> 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). +*> 4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and +*> 4.1. if JOBV = 'V' +*> the minimal requirement is LWORK >= 5*N+2*N*N. +*> 4.2. if JOBV = 'J' the minimal requirement is +*> LWORK >= 4*N+N*N. +*> In both cases, the allocated CWORK can accommodate blocked runs +*> of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ. +*> +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the +*> minimal length of CWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK)) +*> On exit, +*> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) +*> such that SCALE*SVA(1:N) are the computed singular values +*> of A. (See the description of SVA().) +*> RWORK(2) = See the description of RWORK(1). +*> RWORK(3) = SCONDA is an estimate for the condition number of +*> column equilibrated A. (If JOBA = 'E' or 'G') +*> SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +*> It is computed using ZPOCON. It holds +*> N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA +*> where R is the triangular factor from the QRF of A. +*> However, if R is truncated and the numerical rank is +*> determined to be strictly smaller than N, SCONDA is +*> returned as -1, thus indicating that the smallest +*> singular values might be lost. +*> +*> If full SVD is needed, the following two condition numbers are +*> useful for the analysis of the algorithm. They are provided for +*> a developer/implementer who is familiar with the details of +*> the method. +*> +*> RWORK(4) = an estimate of the scaled condition number of the +*> triangular factor in the first QR factorization. +*> RWORK(5) = an estimate of the scaled condition number of the +*> triangular factor in the second QR factorization. +*> The following two parameters are computed if JOBT = 'T'. +*> They are provided for a developer/implementer who is familiar +*> with the details of the method. +*> RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy +*> of diag(A^* * A) / Trace(A^* * A) taken as point in the +*> probability simplex. +*> RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).) +*> If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or +*> LRWORK=-1), then on exit RWORK(1) contains the required length of +*> RWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[in] LRWORK +*> \verbatim +*> LRWORK is INTEGER +*> Length of RWORK to confirm proper allocation of workspace. +*> LRWORK depends on the job: +*> +*> 1. If only the singular values are requested i.e. if +*> LSAME(JOBU,'N') .AND. LSAME(JOBV,'N') +*> then: +*> 1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then: LRWORK = max( 7, 2 * M ). +*> 1.2. Otherwise, LRWORK = max( 7, N ). +*> 2. If singular values with the right singular vectors are requested +*> i.e. if +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND. +*> .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) +*> then: +*> 2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 2.2. Otherwise, LRWORK = max( 7, N ). +*> 3. If singular values with the left singular vectors are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 3.2. Otherwise, LRWORK = max( 7, N ). +*> 4. If singular values with both the left and the right singular vectors +*> are requested, i.e. if +*> (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND. +*> (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) +*> then: +*> 4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'), +*> then LRWORK = max( 7, 2 * M ). +*> 4.2. Otherwise, LRWORK = max( 7, N ). +*> +*> If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and +*> the length of RWORK is returned in RWORK(1). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, of dimension at least 4, that further depends +*> on the job: +*> +*> 1. If only the singular values are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 2. If the singular values and the right singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 3. If the singular values and the left singular vectors are requested then: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4. If the singular values with both the left and the right singular vectors +*> are requested, then: +*> 4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is N+M; otherwise the length of IWORK is N. +*> 4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows: +*> If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) +*> then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N. +*> +*> On exit, +*> IWORK(1) = the numerical rank determined after the initial +*> QR factorization with pivoting. See the descriptions +*> of JOBA and JOBR. +*> IWORK(2) = the number of the computed nonzero singular values +*> IWORK(3) = if nonzero, a warning message: +*> If IWORK(3) = 1 then some of the column norms of A +*> were denormalized floats. The requested high accuracy +*> is not warranted by the data. +*> IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to +*> do the job as specified by the JOB parameters. +*> If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or +*> LRWORK = -1), then on exit IWORK(1) contains the required length of +*> IWORK for the job parameters used in the call. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> < 0: if INFO = -i, then the i-th argument had an illegal value. +*> = 0: successful exit; +*> > 0: ZGEJSV did not converge in the maximal allowed number +*> of sweeps. The computed values may be inaccurate. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsing +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> ZGEJSV implements a preconditioned Jacobi SVD algorithm. It uses ZGEQP3, +*> ZGEQRF, and ZGELQF as preprocessors and preconditioners. Optionally, an +*> additional row pivoting can be used as a preprocessor, which in some +*> cases results in much higher accuracy. An example is matrix A with the +*> structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned +*> diagonal matrices and C is well-conditioned matrix. In that case, complete +*> pivoting in the first QR factorizations provides accuracy dependent on the +*> condition number of C, and independent of D1, D2. Such higher accuracy is +*> not completely understood theoretically, but it works well in practice. +*> Further, if A can be written as A = B*D, with well-conditioned B and some +*> diagonal D, then the high accuracy is guaranteed, both theoretically and +*> in software, independent of D. For more details see [1], [2]. +*> The computational range for the singular values can be the full range +*> ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS +*> & LAPACK routines called by ZGEJSV are implemented to work in that range. +*> If that is not the case, then the restriction for safe computation with +*> the singular values in the range of normalized IEEE numbers is that the +*> spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not +*> overflow. This code (ZGEJSV) is best used in this restricted range, +*> meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are +*> returned as zeros. See JOBR for details on this. +*> Further, this implementation is somewhat slower than the one described +*> in [1,2] due to replacement of some non-LAPACK components, and because +*> the choice of some tuning parameters in the iterative part (ZGESVJ) is +*> left to the implementer on a particular machine. +*> The rank revealing QR factorization (in this code: ZGEQP3) should be +*> implemented as in [3]. We have a new version of ZGEQP3 under development +*> that is more robust than the current one in LAPACK, with a cleaner cut in +*> rank deficient cases. It will be available in the SIGMA library [4]. +*> If M is much larger than N, it is obvious that the initial QRF with +*> column pivoting can be preprocessed by the QRF without pivoting. That +*> well known trick is not used in ZGEJSV because in some cases heavy row +*> weighting can be treated with complete pivoting. The overhead in cases +*> M much larger than N is then only due to pivoting, but the benefits in +*> terms of accuracy have prevailed. The implementer/user can incorporate +*> this extra QRF step easily. The implementer can also improve data movement +*> (matrix transpose, matrix copy, matrix transposed copy) - this +*> implementation of ZGEJSV uses only the simplest, naive data movement. +*> \endverbatim +* +*> \par Contributor: +* ================== +*> +*> Zlatko Drmac, Department of Mathematics, Faculty of Science, +*> University of Zagreb (Zagreb, Croatia); drmac@math.hr +* +*> \par References: +* ================ +*> +*> \verbatim +*> +*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342. +*> LAPACK Working note 169. +*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II. +*> SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362. +*> LAPACK Working note 170. +*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR +*> factorization software - a case study. +*> ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28. +*> LAPACK Working note 176. +*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV, +*> QSVD, (H,K)-SVD computations. +*> Department of Mathematics, University of Zagreb, 2008, 2016. +*> \endverbatim +* +*> \par Bugs, examples and comments: +* ================================= +*> +*> Please report all bugs and send interesting examples and/or comments to +*> drmac@math.hr. Thank you. +*> +* ===================================================================== + SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, + $ M, N, A, LDA, SVA, U, LDU, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, IWORK, INFO ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + IMPLICIT NONE + INTEGER INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), U( LDU, * ), V( LDV, * ), + $ CWORK( LWORK ) + DOUBLE PRECISION SVA( N ), RWORK( LRWORK ) + INTEGER IWORK( * ) + CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV +* .. +* +* =========================================================================== +* +* .. Local Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + COMPLEX*16 CTEMP + DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, + $ COND_OK, CONDR1, CONDR2, ENTRA, ENTRAT, EPSLN, + $ MAXPRJ, SCALEM, SCONDA, SFMIN, SMALL, TEMP1, + $ USCAL1, USCAL2, XSC + INTEGER IERR, N1, NR, NUMRANK, p, q, WARNING + LOGICAL ALMORT, DEFR, ERREST, GOSCAL, JRACC, KILL, LQUERY, + $ LSVEC, L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN, NOSCAL, + $ ROWPIV, RSVEC, TRANSP +* + INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK + INTEGER LWCON, LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM, + $ LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF + INTEGER LWRK_ZGELQF, LWRK_ZGEQP3, LWRK_ZGEQP3N, LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZGESVJV, LWRK_ZGESVJU, LWRK_ZUNMLQ, + $ LWRK_ZUNMQR, LWRK_ZUNMQRM +* .. +* .. Local Arrays + COMPLEX*16 CDUMMY(1) + DOUBLE PRECISION RDUMMY(1) +* +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, CONJG, DLOG, MAX, MIN, DBLE, NINT, SQRT +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DZNRM2 + INTEGER IDAMAX, IZAMAX + LOGICAL LSAME + EXTERNAL IDAMAX, IZAMAX, LSAME, DLAMCH, DZNRM2 +* .. +* .. External Subroutines .. + EXTERNAL DLASSQ, ZCOPY, ZGELQF, ZGEQP3, ZGEQRF, ZLACPY, ZLAPMR, + $ ZLASCL, DLASCL, ZLASET, ZLASSQ, ZLASWP, ZUNGQR, ZUNMLQ, + $ ZUNMQR, ZPOCON, DSCAL, ZDSCAL, ZSWAP, ZTRSM, ZLACGV, + $ XERBLA +* + EXTERNAL ZGESVJ +* .. +* +* Test the input arguments +* + LSVEC = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' ) + JRACC = LSAME( JOBV, 'J' ) + RSVEC = LSAME( JOBV, 'V' ) .OR. JRACC + ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' ) + L2RANK = LSAME( JOBA, 'R' ) + L2ABER = LSAME( JOBA, 'A' ) + ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' ) + L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N ) + L2KILL = LSAME( JOBR, 'R' ) + DEFR = LSAME( JOBR, 'N' ) + L2PERT = LSAME( JOBP, 'P' ) +* + LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) +* + IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR. + $ ERREST .OR. LSAME( JOBA, 'C' ) )) THEN + INFO = - 1 + ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR. + $ ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 2 + ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR. + $ ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN + INFO = - 3 + ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) ) THEN + INFO = - 4 + ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN + INFO = - 5 + ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN + INFO = - 6 + ELSE IF ( M .LT. 0 ) THEN + INFO = - 7 + ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN + INFO = - 8 + ELSE IF ( LDA .LT. M ) THEN + INFO = - 10 + ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN + INFO = - 13 + ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN + INFO = - 15 + ELSE +* #:) + INFO = 0 + END IF +* + IF ( INFO .EQ. 0 ) THEN +* .. compute the minimal and the optimal workspace lengths +* [[The expressions for computing the minimal and the optimal +* values of LCWORK, LRWORK are written with a lot of redundancy and +* can be simplified. However, this verbose form is useful for +* maintenance and modifications of the code.]] +* +* .. minimal workspace length for ZGEQP3 of an M x N matrix, +* ZGEQRF of an N x N matrix, ZGELQF of an N x N matrix, +* ZUNMLQ for computing N x N matrix, ZUNMQR for computing N x N +* matrix, ZUNMQR for computing M x N matrix, respectively. + LWQP3 = N+1 + LWQRF = MAX( 1, N ) + LWLQF = MAX( 1, N ) + LWUNMLQ = MAX( 1, N ) + LWUNMQR = MAX( 1, N ) + LWUNMQRM = MAX( 1, M ) +* .. minimal workspace length for ZPOCON of an N x N matrix + LWCON = 2 * N +* .. minimal workspace length for ZGESVJ of an N x N matrix, +* without and with explicit accumulation of Jacobi rotations + LWSVDJ = MAX( 2 * N, 1 ) + LWSVDJV = MAX( 2 * N, 1 ) +* .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ + LRWQP3 = 2 * N + LRWCON = N + LRWSVDJ = N + IF ( LQUERY ) THEN + CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) + CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) + CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) + LWRK_ZGELQF = INT( CDUMMY(1) ) + END IF + MINWRK = 2 + OPTWRK = 2 + MINIWRK = N + IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN +* .. minimal and optimal sizes of the complex workspace if +* only the singular values are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ ) + ELSE + MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, + $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, + $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the right singular vectors are requested + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF, + $ 2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ ) + ELSE + MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, + $ N+LWSVDJ, N+LWUNMLQ ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, + $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, + $ N+LWRK_ZGESVJ, N+LWRK_ZUNMLQ ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVJ,N+LWRK_ZGELQF, + $ 2*N+LWRK_ZGEQRF, N+LWRK_ZGESVJ, + $ N+LWRK_ZUNMLQ ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN +* .. minimal and optimal sizes of the complex workspace if the +* singular values and the left singular vectors are requested + IF ( ERREST ) THEN + MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM ) + ELSE + MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM ) + END IF + IF ( LQUERY ) THEN + CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, + $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + ELSE + OPTWRK = N + MAX( LWRK_ZGEQP3, N+LWRK_ZGEQRF, + $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + IF ( ERREST ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ ) + END IF + ELSE + IF ( ERREST ) THEN + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) + END IF + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE +* .. minimal and optimal sizes of the complex workspace if the +* full SVD is requested + IF ( .NOT. JRACC ) THEN + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+N**2+LWCON, + $ 2*N+LWQRF, 2*N+LWQP3, + $ 2*N+N**2+N+LWLQF, 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, + $ 2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, + $ N+N**2+LWSVDJ, N+LWUNMQRM ) + END IF + MINIWRK = MINIWRK + N + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + ELSE + IF ( ERREST ) THEN + MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + ELSE + MINWRK = MAX( N+LWQP3, 2*N+LWQRF, + $ 2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, + $ N+LWUNMQRM ) + END IF + IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M + END IF + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + IF ( .NOT. JRACC ) THEN + CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, + $ RDUMMY, IERR ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) + CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, + $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, + $ 2*N+LWRK_ZGEQP3N, + $ 2*N+N**2+N+LWRK_ZGELQF, + $ 2*N+N**2+N+N**2+LWCON, + $ 2*N+N**2+N+LWRK_ZGESVJ, + $ 2*N+N**2+N+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ 2*N+N**2+N+LWRK_ZUNMLQ, + $ N+N**2+LWRK_ZGESVJU, + $ N+LWRK_ZUNMQRM ) + END IF + ELSE + CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, + $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, + $ V, LDV, CDUMMY, -1, IERR ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) + CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, + $ LDU, CDUMMY, -1, IERR ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) + IF ( ERREST ) THEN + OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, + $ 2*N+LWRK_ZGEQRF, 2*N+N**2, + $ 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR,N+LWRK_ZUNMQRM ) + ELSE + OPTWRK = MAX( N+LWRK_ZGEQP3, 2*N+LWRK_ZGEQRF, + $ 2*N+N**2, 2*N+N**2+LWRK_ZGESVJV, + $ 2*N+N**2+N+LWRK_ZUNMQR, + $ N+LWRK_ZUNMQRM ) + END IF + END IF + END IF + IF ( L2TRAN .OR. ROWPIV ) THEN + MINRWRK = MAX( 7, 2*M, LRWQP3, LRWSVDJ, LRWCON ) + ELSE + MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON ) + END IF + END IF + MINWRK = MAX( 2, MINWRK ) + OPTWRK = MAX( MINWRK, OPTWRK ) + IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = - 17 + IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19 + END IF +* + IF ( INFO .NE. 0 ) THEN +* #:( + CALL XERBLA( 'ZGEJSV', - INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + CWORK(1) = OPTWRK + CWORK(2) = MINWRK + RWORK(1) = MINRWRK + IWORK(1) = MAX( 4, MINIWRK ) + RETURN + END IF +* +* Quick return for void matrix (Y3K safe) +* #:) + IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN + IWORK(1:4) = 0 + RWORK(1:7) = 0 + RETURN + ENDIF +* +* Determine whether the matrix U should be M x N or M x M +* + IF ( LSVEC ) THEN + N1 = N + IF ( LSAME( JOBU, 'F' ) ) N1 = M + END IF +* +* Set numerical parameters +* +*! NOTE: Make sure DLAMCH() does not fail on the target architecture. +* + EPSLN = DLAMCH('Epsilon') + SFMIN = DLAMCH('SafeMinimum') + SMALL = SFMIN / EPSLN + BIG = DLAMCH('O') +* BIG = ONE / SFMIN +* +* Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N +* +*(!) If necessary, scale SVA() to protect the largest norm from +* overflow. It is possible that this scaling pushes the smallest +* column norm left from the underflow threshold (extreme case). +* + SCALEM = ONE / SQRT(DBLE(M)*DBLE(N)) + NOSCAL = .TRUE. + GOSCAL = .TRUE. + DO 1874 p = 1, N + AAPP = ZERO + AAQQ = ONE + CALL ZLASSQ( M, A(1,p), 1, AAPP, AAQQ ) + IF ( AAPP .GT. BIG ) THEN + INFO = - 9 + CALL XERBLA( 'ZGEJSV', -INFO ) + RETURN + END IF + AAQQ = SQRT(AAQQ) + IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL ) THEN + SVA(p) = AAPP * AAQQ + ELSE + NOSCAL = .FALSE. + SVA(p) = AAPP * ( AAQQ * SCALEM ) + IF ( GOSCAL ) THEN + GOSCAL = .FALSE. + CALL DSCAL( p-1, SCALEM, SVA, 1 ) + END IF + END IF + 1874 CONTINUE +* + IF ( NOSCAL ) SCALEM = ONE +* + AAPP = ZERO + AAQQ = BIG + DO 4781 p = 1, N + AAPP = MAX( AAPP, SVA(p) ) + IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) ) + 4781 CONTINUE +* +* Quick return for zero M x N matrix +* #:) + IF ( AAPP .EQ. ZERO ) THEN + IF ( LSVEC ) CALL ZLASET( 'G', M, N1, CZERO, CONE, U, LDU ) + IF ( RSVEC ) CALL ZLASET( 'G', N, N, CZERO, CONE, V, LDV ) + RWORK(1) = ONE + RWORK(2) = ONE + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + IWORK(1) = 0 + IWORK(2) = 0 + IWORK(3) = 0 + IWORK(4) = -1 + RETURN + END IF +* +* Issue warning if denormalized column norms detected. Override the +* high relative accuracy request. Issue licence to kill nonzero columns +* (set them to zero) whose norm is less than sigma_max / BIG (roughly). +* #:( + WARNING = 0 + IF ( AAQQ .LE. SFMIN ) THEN + L2RANK = .TRUE. + L2KILL = .TRUE. + WARNING = 1 + END IF +* +* Quick return for one-column matrix +* #:) + IF ( N .EQ. 1 ) THEN +* + IF ( LSVEC ) THEN + CALL ZLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR ) + CALL ZLACPY( 'A', M, 1, A, LDA, U, LDU ) +* computing all M left singular vectors of the M x 1 matrix + IF ( N1 .NE. N ) THEN + CALL ZGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR ) + CALL ZUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR ) + CALL ZCOPY( M, A(1,1), 1, U(1,1), 1 ) + END IF + END IF + IF ( RSVEC ) THEN + V(1,1) = CONE + END IF + IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN + SVA(1) = SVA(1) / SCALEM + SCALEM = ONE + END IF + RWORK(1) = ONE / SCALEM + RWORK(2) = ONE + IF ( SVA(1) .NE. ZERO ) THEN + IWORK(1) = 1 + IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN + IWORK(2) = 1 + ELSE + IWORK(2) = 0 + END IF + ELSE + IWORK(1) = 0 + IWORK(2) = 0 + END IF + IWORK(3) = 0 + IWORK(4) = -1 + IF ( ERREST ) RWORK(3) = ONE + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = ONE + RWORK(5) = ONE + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ZERO + RWORK(7) = ZERO + END IF + RETURN +* + END IF +* + TRANSP = .FALSE. +* + AATMAX = -ONE + AATMIN = BIG + IF ( ROWPIV .OR. L2TRAN ) THEN +* +* Compute the row norms, needed to determine row pivoting sequence +* (in the case of heavily row weighted A, row pivoting is strongly +* advised) and to collect information needed to compare the +* structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.). +* + IF ( L2TRAN ) THEN + DO 1950 p = 1, M + XSC = ZERO + TEMP1 = ONE + CALL ZLASSQ( N, A(p,1), LDA, XSC, TEMP1 ) +* ZLASSQ gets both the ell_2 and the ell_infinity norm +* in one pass through the vector + RWORK(M+p) = XSC * SCALEM + RWORK(p) = XSC * (SCALEM*SQRT(TEMP1)) + AATMAX = MAX( AATMAX, RWORK(p) ) + IF (RWORK(p) .NE. ZERO) + $ AATMIN = MIN(AATMIN,RWORK(p)) + 1950 CONTINUE + ELSE + DO 1904 p = 1, M + RWORK(M+p) = SCALEM*ABS( A(p,IZAMAX(N,A(p,1),LDA)) ) + AATMAX = MAX( AATMAX, RWORK(M+p) ) + AATMIN = MIN( AATMIN, RWORK(M+p) ) + 1904 CONTINUE + END IF +* + END IF +* +* For square matrix A try to determine whether A^* would be better +* input for the preconditioned Jacobi SVD, with faster convergence. +* The decision is based on an O(N) function of the vector of column +* and row norms of A, based on the Shannon entropy. This should give +* the right choice in most cases when the difference actually matters. +* It may fail and pick the slower converging side. +* + ENTRA = ZERO + ENTRAT = ZERO + IF ( L2TRAN ) THEN +* + XSC = ZERO + TEMP1 = ONE + CALL DLASSQ( N, SVA, 1, XSC, TEMP1 ) + TEMP1 = ONE / TEMP1 +* + ENTRA = ZERO + DO 1113 p = 1, N + BIG1 = ( ( SVA(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1) + 1113 CONTINUE + ENTRA = - ENTRA / DLOG(DBLE(N)) +* +* Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex. +* It is derived from the diagonal of A^* * A. Do the same with the +* diagonal of A * A^*, compute the entropy of the corresponding +* probability distribution. Note that A * A^* and A^* * A have the +* same trace. +* + ENTRAT = ZERO + DO 1114 p = 1, M + BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1 + IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1) + 1114 CONTINUE + ENTRAT = - ENTRAT / DLOG(DBLE(M)) +* +* Analyze the entropies and decide A or A^*. Smaller entropy +* usually means better input for the algorithm. +* + TRANSP = ( ENTRAT .LT. ENTRA ) +* +* If A^* is better than A, take the adjoint of A. This is allowed +* only for square matrices, M=N. + IF ( TRANSP ) THEN +* In an optimal implementation, this trivial transpose +* should be replaced with faster transpose. + DO 1115 p = 1, N - 1 + A(p,p) = CONJG(A(p,p)) + DO 1116 q = p + 1, N + CTEMP = CONJG(A(q,p)) + A(q,p) = CONJG(A(p,q)) + A(p,q) = CTEMP + 1116 CONTINUE + 1115 CONTINUE + A(N,N) = CONJG(A(N,N)) + DO 1117 p = 1, N + RWORK(M+p) = SVA(p) + SVA(p) = RWORK(p) +* previously computed row 2-norms are now column 2-norms +* of the transposed matrix + 1117 CONTINUE + TEMP1 = AAPP + AAPP = AATMAX + AATMAX = TEMP1 + TEMP1 = AAQQ + AAQQ = AATMIN + AATMIN = TEMP1 + KILL = LSVEC + LSVEC = RSVEC + RSVEC = KILL + IF ( LSVEC ) N1 = N +* + ROWPIV = .TRUE. + END IF +* + END IF +* END IF L2TRAN +* +* Scale the matrix so that its maximal singular value remains less +* than SQRT(BIG) -- the matrix is scaled so that its maximal column +* has Euclidean norm equal to SQRT(BIG/N). The only reason to keep +* SQRT(BIG) instead of BIG is the fact that ZGEJSV uses LAPACK and +* BLAS routines that, in some implementations, are not capable of +* working in the full interval [SFMIN,BIG] and that they may provoke +* overflows in the intermediate results. If the singular values spread +* from SFMIN to BIG, then ZGESVJ will compute them. So, in that case, +* one should use ZGESVJ instead of ZGEJSV. +* >> change in the April 2016 update: allow bigger range, i.e. the +* largest column is allowed up to BIG/N and ZGESVJ will do the rest. + BIG1 = SQRT( BIG ) + TEMP1 = SQRT( BIG / DBLE(N) ) +* TEMP1 = BIG/DBLE(N) +* + CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR ) + IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN + AAQQ = ( AAQQ / AAPP ) * TEMP1 + ELSE + AAQQ = ( AAQQ * TEMP1 ) / AAPP + END IF + TEMP1 = TEMP1 * SCALEM + CALL ZLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR ) +* +* To undo scaling at the end of this procedure, multiply the +* computed singular values with USCAL2 / USCAL1. +* + USCAL1 = TEMP1 + USCAL2 = AAPP +* + IF ( L2KILL ) THEN +* L2KILL enforces computation of nonzero singular values in +* the restricted range of condition number of the initial A, +* sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN). + XSC = SQRT( SFMIN ) + ELSE + XSC = SMALL +* +* Now, if the condition number of A is too big, +* sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN, +* as a precaution measure, the full SVD is computed using ZGESVJ +* with accumulated Jacobi rotations. This provides numerically +* more robust computation, at the cost of slightly increased run +* time. Depending on the concrete implementation of BLAS and LAPACK +* (i.e. how they behave in presence of extreme ill-conditioning) the +* implementor may decide to remove this switch. + IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN + JRACC = .TRUE. + END IF +* + END IF + IF ( AAQQ .LT. XSC ) THEN + DO 700 p = 1, N + IF ( SVA(p) .LT. XSC ) THEN + CALL ZLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA ) + SVA(p) = ZERO + END IF + 700 CONTINUE + END IF +* +* Preconditioning using QR factorization with pivoting +* + IF ( ROWPIV ) THEN +* Optional row permutation (Bjoerck row pivoting): +* A result by Cox and Higham shows that the Bjoerck's +* row pivoting combined with standard column pivoting +* has similar effect as Powell-Reid complete pivoting. +* The ell-infinity norms of A are made nonincreasing. + IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN + IWOFF = 2*N + ELSE + IWOFF = N + END IF + DO 1952 p = 1, M - 1 + q = IDAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1 + IWORK(IWOFF+p) = q + IF ( p .NE. q ) THEN + TEMP1 = RWORK(M+p) + RWORK(M+p) = RWORK(M+q) + RWORK(M+q) = TEMP1 + END IF + 1952 CONTINUE + CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 ) + END IF +* +* End of the preparation phase (scaling, optional sorting and +* transposing, optional flushing of small columns). +* +* Preconditioning +* +* If the full SVD is needed, the right singular vectors are computed +* from a matrix equation, and for that we need theoretical analysis +* of the Businger-Golub pivoting. So we use ZGEQP3 as the first RR QRF. +* In all other cases the first RR QRF can be chosen by other criteria +* (eg speed by replacing global with restricted window pivoting, such +* as in xGEQPX from TOMS # 782). Good results will be obtained using +* xGEQPX with properly (!) chosen numerical parameters. +* Any improvement of ZGEQP3 improves overall performance of ZGEJSV. +* +* A * P1 = Q1 * [ R1^* 0]^*: + DO 1963 p = 1, N +* .. all columns are free columns + IWORK(p) = 0 + 1963 CONTINUE + CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N, + $ RWORK, IERR ) +* +* The upper triangular matrix R1 from the first QRF is inspected for +* rank deficiency and possibilities for deflation, or possible +* ill-conditioning. Depending on the user specified flag L2RANK, +* the procedure explores possibilities to reduce the numerical +* rank by inspecting the computed upper triangular factor. If +* L2RANK or L2ABER are up, then ZGEJSV will compute the SVD of +* A + dA, where ||dA|| <= f(M,N)*EPSLN. +* + NR = 1 + IF ( L2ABER ) THEN +* Standard absolute error bound suffices. All sigma_i with +* sigma_i < N*EPSLN*||A|| are flushed to zero. This is an +* aggressive enforcement of lower numerical rank by introducing a +* backward error of the order of N*EPSLN*||A||. + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 3001 p = 2, N + IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN + NR = NR + 1 + ELSE + GO TO 3002 + END IF + 3001 CONTINUE + 3002 CONTINUE + ELSE IF ( L2RANK ) THEN +* .. similarly as above, only slightly more gentle (less aggressive). +* Sudden drop on the diagonal of R1 is used as the criterion for +* close-to-rank-deficient. + TEMP1 = SQRT(SFMIN) + DO 3401 p = 2, N + IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR. + $ ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402 + NR = NR + 1 + 3401 CONTINUE + 3402 CONTINUE +* + ELSE +* The goal is high relative accuracy. However, if the matrix +* has high scaled condition number the relative accuracy is in +* general not feasible. Later on, a condition number estimator +* will be deployed to estimate the scaled condition number. +* Here we just remove the underflowed part of the triangular +* factor. This prevents the situation in which the code is +* working hard to get the accuracy not warranted by the data. + TEMP1 = SQRT(SFMIN) + DO 3301 p = 2, N + IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR. + $ ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302 + NR = NR + 1 + 3301 CONTINUE + 3302 CONTINUE +* + END IF +* + ALMORT = .FALSE. + IF ( NR .EQ. N ) THEN + MAXPRJ = ONE + DO 3051 p = 2, N + TEMP1 = ABS(A(p,p)) / SVA(IWORK(p)) + MAXPRJ = MIN( MAXPRJ, TEMP1 ) + 3051 CONTINUE + IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE. + END IF +* +* + SCONDA = - ONE + CONDR1 = - ONE + CONDR2 = - ONE +* + IF ( ERREST ) THEN + IF ( N .EQ. NR ) THEN + IF ( RSVEC ) THEN +* .. V is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, V, LDV ) + DO 3053 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, V(1,p), 1 ) + 3053 CONTINUE + IF ( LSVEC )THEN + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1, + $ CWORK, RWORK, IERR ) + END IF +* + ELSE IF ( LSVEC ) THEN +* .. U is available as workspace + CALL ZLACPY( 'U', N, N, A, LDA, U, LDU ) + DO 3054 p = 1, N + TEMP1 = SVA(IWORK(p)) + CALL ZDSCAL( p, ONE/TEMP1, U(1,p), 1 ) + 3054 CONTINUE + CALL ZPOCON( 'U', N, U, LDU, ONE, TEMP1, + $ CWORK(N+1), RWORK, IERR ) + ELSE + CALL ZLACPY( 'U', N, N, A, LDA, CWORK, N ) +*[] CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) +* Change: here index shifted by N to the left, CWORK(1:N) +* not needed for SIGMA only computation + DO 3052 p = 1, N + TEMP1 = SVA(IWORK(p)) +*[] CALL ZDSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 ) + CALL ZDSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 ) + 3052 CONTINUE +* .. the columns of R are scaled to have unit Euclidean lengths. +*[] CALL ZPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1, +*[] $ CWORK(N+N*N+1), RWORK, IERR ) + CALL ZPOCON( 'U', N, CWORK, N, ONE, TEMP1, + $ CWORK(N*N+1), RWORK, IERR ) +* + END IF + IF ( TEMP1 .NE. ZERO ) THEN + SCONDA = ONE / SQRT(TEMP1) + ELSE + SCONDA = - ONE + END IF +* SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1). +* N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA + ELSE + SCONDA = - ONE + END IF + END IF +* + L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) ) +* If there is no violent scaling, artificial perturbation is not needed. +* +* Phase 3: +* + IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN +* +* Singular Values only +* +* .. transpose A(1:NR,1:N) + DO 1946 p = 1, MIN( N-1, NR ) + CALL ZCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( N-p+1, A(p,p), 1 ) + 1946 CONTINUE + IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N)) +* +* The following two DO-loops introduce small relative perturbation +* into the strict upper triangle of the lower triangular matrix. +* Small entries below the main diagonal are also changed. +* This modification is useful if the computing environment does not +* provide/allow FLUSH TO ZERO underflow, for it prevents many +* annoying denormalized numbers in case of strongly scaled matrices. +* The perturbation is structured so that it does not introduce any +* new perturbation of the singular values, and it does not destroy +* the job done by the preconditioner. +* The licence for this perturbation is in the variable L2PERT, which +* should be .FALSE. if FLUSH TO ZERO underflow is active. +* + IF ( .NOT. ALMORT ) THEN +* + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 4947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 4949 p = 1, N + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 4949 CONTINUE + 4947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA ) + END IF +* +* .. second preconditioning using the QR factorization +* + CALL ZGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR ) +* +* .. and transpose upper to lower triangular + DO 1948 p = 1, NR - 1 + CALL ZCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 ) + CALL ZLACGV( NR-p+1, A(p,p), 1 ) + 1948 CONTINUE +* + END IF +* +* Row-cyclic Jacobi SVD algorithm with column pivoting +* +* .. again some perturbation (a "background noise") is added +* to drown denormals + IF ( L2PERT ) THEN +* XSC = SQRT(SMALL) + XSC = EPSLN / DBLE(N) + DO 1947 q = 1, NR + CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO) + DO 1949 p = 1, NR + IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) ) + $ .OR. ( p .LT. q ) ) +* $ A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) ) + $ A(p,q) = CTEMP + 1949 CONTINUE + 1947 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA ) + END IF +* +* .. and one-sided Jacobi rotations are started on a lower +* triangular matrix (plus perturbation which is ignored in +* the part which destroys triangular form (confusing?!)) +* + CALL ZGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA, + $ N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* +* + ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) + $ .OR. + $ ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN +* +* -> Singular Values and Right Singular Vectors <- +* + IF ( ALMORT ) THEN +* +* .. in this case NR equals N + DO 1998 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1998 CONTINUE + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) +* + CALL ZGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + ELSE +* +* .. two more QR factorizations ( one QRF is not enough, two require +* accumulated product of Jacobi rotations, three are perfect ) +* + CALL ZLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA ) + CALL ZGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR) + CALL ZLACPY( 'L', NR, NR, A, LDA, V, LDV ) + CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV ) + CALL ZGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + DO 8998 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 ) + CALL ZLACGV( NR-p+1, V(p,p), 1 ) + 8998 CONTINUE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV) +* + CALL ZGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U, + $ LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1), LDV ) + CALL ZLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1), LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV ) + END IF +* + CALL ZUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK, + $ V, LDV, CWORK(N+1), LWORK-N, IERR ) +* + END IF +* .. permute the rows of V +* DO 8991 p = 1, N +* CALL ZCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA ) +* 8991 CONTINUE +* CALL ZLACPY( 'All', N, N, A, LDA, V, LDV ) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, V, LDV, U, LDU ) + END IF +* + ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN +* + CALL ZLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA ) +* + CALL ZGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV, + $ CWORK, LWORK, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK ) +* + ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN +* +* .. Singular Values and Left Singular Vectors .. +* +* .. second preconditioning step to avoid need to accumulate +* Jacobi rotations in the Jacobi iterations. + DO 1965 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1965 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + DO 1967 p = 1, NR - 1 + CALL ZCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 ) + CALL ZLACGV( N-p+1, U(p,p), 1 ) + 1967 CONTINUE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) +* + CALL ZGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A, + $ LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU ) + CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU ) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + DO 1974 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1974 CONTINUE +* + IF ( TRANSP ) THEN + CALL ZLACPY( 'A', N, N, U, LDU, V, LDV ) + END IF +* + ELSE +* +* .. Full SVD .. +* + IF ( .NOT. JRACC ) THEN +* + IF ( .NOT. ALMORT ) THEN +* +* Second Preconditioning Step (QRF [with pivoting]) +* Note that the composition of TRANSPOSE, QRF and TRANSPOSE is +* equivalent to an LQF CALL. Since in many libraries the QRF +* seems to be better optimized than the LQF, we do explicit +* transpose and use the QRF. This is subject to changes in an +* optimized implementation of ZGEJSV. +* + DO 1968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 1968 CONTINUE +* +* .. the following two loops perturb small entries to avoid +* denormals in the second QR factorization, where they are +* as good as zeros. This is done to avoid painfully slow +* computation with denormals. The relative size of the perturbation +* is a parameter that can be changed by the implementer. +* This perturbation device will be obsolete on machines with +* properly implemented arithmetic. +* To switch it off, set L2PERT=.FALSE. To remove it from the +* code, remove the action under L2PERT=.TRUE., leave the ELSE part. +* The following two loops should be blocked and fused with the +* transposed copy above. +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 2969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 2968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 2968 CONTINUE + 2969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF +* +* Estimate the row scaled condition number of R1 +* (If R1 is rectangular, N > NR, then the condition number +* of the leading NR x NR submatrix is estimated.) +* + CALL ZLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR ) + DO 3950 p = 1, NR + TEMP1 = DZNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1) + CALL ZDSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1) + 3950 CONTINUE + CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1, + $ CWORK(2*N+NR*NR+1),RWORK,IERR) + CONDR1 = ONE / SQRT(TEMP1) +* .. here need a second opinion on the condition number +* .. then assume worst case scenario +* R1 is OK for inverse <=> CONDR1 .LT. DBLE(N) +* more conservative <=> CONDR1 .LT. SQRT(DBLE(N)) +* + COND_OK = SQRT(SQRT(DBLE(NR))) +*[TP] COND_OK is a tuning parameter. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* .. the second QRF without pivoting. Note: in an optimized +* implementation, this QRF should be implemented as the QRF +* of a lower triangular matrix. +* R1^* = Q2 * R2 + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL)/EPSLN + DO 3959 p = 2, NR + DO 3958 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3958 CONTINUE + 3959 CONTINUE + END IF +* + IF ( NR .NE. N ) + $ CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* .. save ... +* +* .. this transposed copy should be better than naive + DO 1969 p = 1, NR - 1 + CALL ZCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 ) + CALL ZLACGV(NR-p+1, V(p,p), 1 ) + 1969 CONTINUE + V(NR,NR)=CONJG(V(NR,NR)) +* + CONDR2 = CONDR1 +* + ELSE +* +* .. ill-conditioned case: second QRF with pivoting +* Note that windowed pivoting would be equally good +* numerically, and more run-time efficient. So, in +* an optimal implementation, the next call to ZGEQP3 +* should be replaced with eg. CALL ZGEQPX (ACM TOMS #782) +* with properly (carefully) chosen parameters. +* +* R1^* * P2 = Q2 * R2 + DO 3003 p = 1, NR + IWORK(N+p) = 0 + 3003 CONTINUE + CALL ZGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1), + $ CWORK(2*N+1), LWORK-2*N, RWORK, IERR ) +** CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), +** $ LWORK-2*N, IERR ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 3969 p = 2, NR + DO 3968 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) + IF ( ABS(V(q,p)) .LE. TEMP1 ) +* $ V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) ) + $ V(q,p) = CTEMP + 3968 CONTINUE + 3969 CONTINUE + END IF +* + CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N ) +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 8970 p = 2, NR + DO 8971 q = 1, p - 1 + CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))), + $ ZERO) +* V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) ) + V(p,q) = - CTEMP + 8971 CONTINUE + 8970 CONTINUE + ELSE + CALL ZLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV ) + END IF +* Now, compute R2 = L3 * Q3, the LQ factorization. + CALL ZGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1), + $ CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR ) +* .. and estimate the condition number + CALL ZLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR ) + DO 4950 p = 1, NR + TEMP1 = DZNRM2( p, CWORK(2*N+N*NR+NR+p), NR ) + CALL ZDSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR ) + 4950 CONTINUE + CALL ZPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1, + $ CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR ) + CONDR2 = ONE / SQRT(TEMP1) +* +* + IF ( CONDR2 .GE. COND_OK ) THEN +* .. save the Householder vectors used for Q3 +* (this overwrites the copy of R2, as it will not be +* needed in this branch, but it does not overwritte the +* Huseholder vectors of Q2.). + CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) +* .. and the rest of the information on Q3 is in +* WORK(2*N+N*NR+1:2*N+N*NR+N) + END IF +* + END IF +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 4968 q = 2, NR + CTEMP = XSC * V(q,q) + DO 4969 p = 1, q - 1 +* V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) ) + V(p,q) = - CTEMP + 4969 CONTINUE + 4968 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV ) + END IF +* +* Second preconditioning finished; continue with Jacobi SVD +* The input matrix is lower trinagular. +* +* Recover the right singular vectors as solution of a well +* conditioned triangular matrix equation. +* + IF ( CONDR1 .LT. COND_OK ) THEN +* + CALL ZGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU, + $ CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK, + $ LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3970 p = 1, NR + CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL ZDSCAL( NR, SVA(p), V(1,p), 1 ) + 3970 CONTINUE + +* .. pick the right matrix equation and solve it +* + IF ( NR .EQ. N ) THEN +* :)) .. best case, R1 is inverted. The solution of this matrix +* equation is Q2*V2 = the product of the Jacobi rotations +* used in ZGESVJ, premultiplied with the orthogonal matrix +* from the second QR factorization. + CALL ZTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV) + ELSE +* .. R1 is well conditioned, but non-square. Adjoint of R2 +* is inverted to get the product of the Jacobi rotations +* used in ZGESVJ. The Q-factor from the second QR +* factorization is then built in explicitly. + CALL ZTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1), + $ N,V,LDV) + IF ( NR .LT. N ) THEN + CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV) + CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR) + END IF +* + ELSE IF ( CONDR2 .LT. COND_OK ) THEN +* +* The matrix R2 is inverted. The solution of the matrix equation +* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* the lower triangular L3 from the LQ factorization of +* R2=L3*Q3), pre-multiplied with the transposed Q3. + CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 3870 p = 1, NR + CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 ) + CALL ZDSCAL( NR, SVA(p), U(1,p), 1 ) + 3870 CONTINUE + CALL ZTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N, + $ U,LDU) +* .. apply the permutation from the second QR factorization + DO 873 q = 1, NR + DO 872 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 872 CONTINUE + DO 874 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 874 CONTINUE + 873 CONTINUE + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) + ELSE +* Last line of defense. +* #:( This is a rather pathological case: no scaled condition +* improvement after two pivoted QR factorizations. Other +* possibility is that the rank revealing QR factorization +* or the condition estimator has failed, or the COND_OK +* is set very close to ONE (which is unnecessary). Normally, +* this branch should never be executed, but in rare cases of +* failure of the RRQR or condition estimator, the last line of +* defense ensures that ZGEJSV completes the task. +* Compute the full SVD of L3 using ZGESVJ with explicit +* accumulation of Jacobi rotations. + CALL ZGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U, + $ LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV) + END IF + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* + CALL ZUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N, + $ CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1), + $ LWORK-2*N-N*NR-NR, IERR ) + DO 773 q = 1, NR + DO 772 p = 1, NR + CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q) + 772 CONTINUE + DO 774 p = 1, NR + U(p,q) = CWORK(2*N+N*NR+NR+p) + 774 CONTINUE + 773 CONTINUE +* + END IF +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(DBLE(N)) * EPSLN + DO 1972 q = 1, N + DO 972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 972 CONTINUE + DO 973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 973 CONTINUE + XSC = ONE / DZNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,q), 1 ) + 1972 CONTINUE +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). + IF ( NR .LT. M ) THEN + CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR,CZERO,CONE, + $ U(NR+1,NR+1),LDU) + END IF + END IF +* +* The Q matrix from the first QRF is built into the left singular +* matrix U. This applies to all cases. +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + +* The columns of U are normalized. The cost is O(M*N) flops. + TEMP1 = SQRT(DBLE(M)) * EPSLN + DO 1973 p = 1, NR + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 1973 CONTINUE +* +* If the initial QRF is computed with row pivoting, the left +* singular vectors must be adjusted. +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + ELSE +* +* .. the initial matrix A has almost orthogonal columns and +* the second QRF is not needed +* + CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N ) + IF ( L2PERT ) THEN + XSC = SQRT(SMALL) + DO 5970 p = 2, N + CTEMP = XSC * CWORK( N + (p-1)*N + p ) + DO 5971 q = 1, p - 1 +* CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) / +* $ ABS(CWORK(N+(p-1)*N+q)) ) + CWORK(N+(q-1)*N+p)=-CTEMP + 5971 CONTINUE + 5970 CONTINUE + ELSE + CALL ZLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N ) + END IF +* + CALL ZGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA, + $ N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK, + $ INFO ) +* + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + DO 6970 p = 1, N + CALL ZCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 ) + CALL ZDSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 ) + 6970 CONTINUE +* + CALL ZTRSM( 'L', 'U', 'N', 'N', N, N, + $ CONE, A, LDA, CWORK(N+1), N ) + DO 6972 p = 1, N + CALL ZCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV ) + 6972 CONTINUE + TEMP1 = SQRT(DBLE(N))*EPSLN + DO 6971 p = 1, N + XSC = ONE / DZNRM2( N, V(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,p), 1 ) + 6971 CONTINUE +* +* Assemble the left singular vector matrix U (M x N). +* + IF ( N .LT. M ) THEN + CALL ZLASET( 'A', M-N, N, CZERO, CZERO, U(N+1,1), LDU ) + IF ( N .LT. N1 ) THEN + CALL ZLASET('A',N, N1-N, CZERO, CZERO, U(1,N+1),LDU) + CALL ZLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU) + END IF + END IF + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) + TEMP1 = SQRT(DBLE(M))*EPSLN + DO 6973 p = 1, N1 + XSC = ONE / DZNRM2( M, U(1,p), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( M, XSC, U(1,p), 1 ) + 6973 CONTINUE +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* + END IF +* +* end of the >> almost orthogonal case << in the full SVD +* + ELSE +* +* This branch deploys a preconditioned Jacobi SVD with explicitly +* accumulated rotations. It is included as optional, mainly for +* experimental purposes. It does perform well, and can also be used. +* In this implementation, this branch will be automatically activated +* if the condition number sigma_max(A) / sigma_min(A) is predicted +* to be greater than the overflow threshold. This is because the +* a posteriori computation of the singular vectors assumes robust +* implementation of BLAS and some LAPACK procedures, capable of working +* in presence of extreme values, e.g. when the singular values spread from +* the underflow to the overflow threshold. +* + DO 7968 p = 1, NR + CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 ) + CALL ZLACGV( N-p+1, V(p,p), 1 ) + 7968 CONTINUE +* + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 5969 q = 1, NR + CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO) + DO 5968 p = 1, N + IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 ) + $ .OR. ( p .LT. q ) ) +* $ V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) ) + $ V(p,q) = CTEMP + IF ( p .LT. q ) V(p,q) = - V(p,q) + 5968 CONTINUE + 5969 CONTINUE + ELSE + CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV ) + END IF + + CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1), + $ LWORK-2*N, IERR ) + CALL ZLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N ) +* + DO 7969 p = 1, NR + CALL ZCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 ) + CALL ZLACGV( NR-p+1, U(p,p), 1 ) + 7969 CONTINUE + + IF ( L2PERT ) THEN + XSC = SQRT(SMALL/EPSLN) + DO 9970 q = 2, NR + DO 9971 p = 1, q - 1 + CTEMP = DCMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))), + $ ZERO) +* U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) ) + U(p,q) = - CTEMP + 9971 CONTINUE + 9970 CONTINUE + ELSE + CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU ) + END IF + + CALL ZGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA, + $ N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR, + $ RWORK, LRWORK, INFO ) + SCALEM = RWORK(1) + NUMRANK = NINT(RWORK(2)) + + IF ( NR .LT. N ) THEN + CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV ) + CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV ) + CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV ) + END IF + + CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1), + $ V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR ) +* +* Permute the rows of V using the (column) permutation from the +* first QRF. Also, scale the columns to make them unit in +* Euclidean norm. This applies to all cases. +* + TEMP1 = SQRT(DBLE(N)) * EPSLN + DO 7972 q = 1, N + DO 8972 p = 1, N + CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q) + 8972 CONTINUE + DO 8973 p = 1, N + V(p,q) = CWORK(2*N+N*NR+NR+p) + 8973 CONTINUE + XSC = ONE / DZNRM2( N, V(1,q), 1 ) + IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) ) + $ CALL ZDSCAL( N, XSC, V(1,q), 1 ) + 7972 CONTINUE +* +* At this moment, V contains the right singular vectors of A. +* Next, assemble the left singular vector matrix U (M x N). +* + IF ( NR .LT. M ) THEN + CALL ZLASET( 'A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU ) + IF ( NR .LT. N1 ) THEN + CALL ZLASET('A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU) + CALL ZLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU) + END IF + END IF +* + CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U, + $ LDU, CWORK(N+1), LWORK-N, IERR ) +* + IF ( ROWPIV ) + $ CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 ) +* +* + END IF + IF ( TRANSP ) THEN +* .. swap U and V because the procedure worked on A^* + DO 6974 p = 1, N + CALL ZSWAP( N, U(1,p), 1, V(1,p), 1 ) + 6974 CONTINUE + END IF +* + END IF +* end of the full SVD +* +* Undo scaling, if necessary (and possible) +* + IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN + CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR ) + USCAL1 = ONE + USCAL2 = ONE + END IF +* + IF ( NR .LT. N ) THEN + DO 3004 p = NR+1, N + SVA(p) = ZERO + 3004 CONTINUE + END IF +* + RWORK(1) = USCAL2 * SCALEM + RWORK(2) = USCAL1 + IF ( ERREST ) RWORK(3) = SCONDA + IF ( LSVEC .AND. RSVEC ) THEN + RWORK(4) = CONDR1 + RWORK(5) = CONDR2 + END IF + IF ( L2TRAN ) THEN + RWORK(6) = ENTRA + RWORK(7) = ENTRAT + END IF +* + IWORK(1) = NR + IWORK(2) = NUMRANK + IWORK(3) = WARNING + IF ( TRANSP ) THEN + IWORK(4) = 1 + ELSE + IWORK(4) = -1 + END IF + +* + RETURN +* .. +* .. END OF ZGEJSV +* .. + END +* diff --git a/lapack-netlib/zgesvx.f b/lapack-netlib/zgesvx.f new file mode 100644 index 000000000..3b193a1b2 --- /dev/null +++ b/lapack-netlib/zgesvx.f @@ -0,0 +1,602 @@ +*> \brief ZGESVX computes the solution to system of linear equations A * X = B for GE matrices +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGESVX + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, +* EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, +* WORK, RWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER EQUED, FACT, TRANS +* INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS +* DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. +* INTEGER IPIV( * ) +* DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), +* $ RWORK( * ) +* COMPLEX*16 A( LDA, * ), AF( LDAF, * ), B( LDB, * ), +* $ WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGESVX uses the LU factorization to compute the solution to a complex +*> system of linear equations +*> A * X = B, +*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +*> +*> Error bounds on the solution and a condition estimate are also +*> provided. +*> \endverbatim +* +*> \par Description: +* ================= +*> +*> \verbatim +*> +*> The following steps are performed: +*> +*> 1. If FACT = 'E', real scaling factors are computed to equilibrate +*> the system: +*> TRANS = 'N': diag(R)*A*diag(C) *inv(diag(C))*X = diag(R)*B +*> TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B +*> TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B +*> Whether or not the system will be equilibrated depends on the +*> scaling of the matrix A, but if equilibration is used, A is +*> overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N') +*> or diag(C)*B (if TRANS = 'T' or 'C'). +*> +*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the +*> matrix A (after equilibration if FACT = 'E') as +*> A = P * L * U, +*> where P is a permutation matrix, L is a unit lower triangular +*> matrix, and U is upper triangular. +*> +*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine +*> returns with INFO = i. Otherwise, the factored form of A is used +*> to estimate the condition number of the matrix A. If the +*> reciprocal of the condition number is less than machine precision, +*> INFO = N+1 is returned as a warning, but the routine still goes on +*> to solve for X and compute error bounds as described below. +*> +*> 4. The system of equations is solved for X using the factored form +*> of A. +*> +*> 5. Iterative refinement is applied to improve the computed solution +*> matrix and calculate error bounds and backward error estimates +*> for it. +*> +*> 6. If equilibration was used, the matrix X is premultiplied by +*> diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so +*> that it solves the original system before equilibration. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] FACT +*> \verbatim +*> FACT is CHARACTER*1 +*> Specifies whether or not the factored form of the matrix A is +*> supplied on entry, and if not, whether the matrix A should be +*> equilibrated before it is factored. +*> = 'F': On entry, AF and IPIV contain the factored form of A. +*> If EQUED is not 'N', the matrix A has been +*> equilibrated with scaling factors given by R and C. +*> A, AF, and IPIV are not modified. +*> = 'N': The matrix A will be copied to AF and factored. +*> = 'E': The matrix A will be equilibrated if necessary, then +*> copied to AF and factored. +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the form of the system of equations: +*> = 'N': A * X = B (No transpose) +*> = 'T': A**T * X = B (Transpose) +*> = 'C': A**H * X = B (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of linear equations, i.e., the order of the +*> matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of columns +*> of the matrices B and X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the N-by-N matrix A. If FACT = 'F' and EQUED is +*> not 'N', then A must have been equilibrated by the scaling +*> factors in R and/or C. A is not modified if FACT = 'F' or +*> 'N', or if FACT = 'E' and EQUED = 'N' on exit. +*> +*> On exit, if EQUED .ne. 'N', A is scaled as follows: +*> EQUED = 'R': A := diag(R) * A +*> EQUED = 'C': A := A * diag(C) +*> EQUED = 'B': A := diag(R) * A * diag(C). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] AF +*> \verbatim +*> AF is COMPLEX*16 array, dimension (LDAF,N) +*> If FACT = 'F', then AF is an input argument and on entry +*> contains the factors L and U from the factorization +*> A = P*L*U as computed by ZGETRF. If EQUED .ne. 'N', then +*> AF is the factored form of the equilibrated matrix A. +*> +*> If FACT = 'N', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then AF is an output argument and on exit +*> returns the factors L and U from the factorization A = P*L*U +*> of the equilibrated matrix A (see the description of A for +*> the form of the equilibrated matrix). +*> \endverbatim +*> +*> \param[in] LDAF +*> \verbatim +*> LDAF is INTEGER +*> The leading dimension of the array AF. LDAF >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] IPIV +*> \verbatim +*> IPIV is INTEGER array, dimension (N) +*> If FACT = 'F', then IPIV is an input argument and on entry +*> contains the pivot indices from the factorization A = P*L*U +*> as computed by ZGETRF; row i of the matrix was interchanged +*> with row IPIV(i). +*> +*> If FACT = 'N', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the original matrix A. +*> +*> If FACT = 'E', then IPIV is an output argument and on exit +*> contains the pivot indices from the factorization A = P*L*U +*> of the equilibrated matrix A. +*> \endverbatim +*> +*> \param[in,out] EQUED +*> \verbatim +*> EQUED is CHARACTER*1 +*> Specifies the form of equilibration that was done. +*> = 'N': No equilibration (always true if FACT = 'N'). +*> = 'R': Row equilibration, i.e., A has been premultiplied by +*> diag(R). +*> = 'C': Column equilibration, i.e., A has been postmultiplied +*> by diag(C). +*> = 'B': Both row and column equilibration, i.e., A has been +*> replaced by diag(R) * A * diag(C). +*> EQUED is an input argument if FACT = 'F'; otherwise, it is an +*> output argument. +*> \endverbatim +*> +*> \param[in,out] R +*> \verbatim +*> R is DOUBLE PRECISION array, dimension (N) +*> The row scale factors for A. If EQUED = 'R' or 'B', A is +*> multiplied on the left by diag(R); if EQUED = 'N' or 'C', R +*> is not accessed. R is an input argument if FACT = 'F'; +*> otherwise, R is an output argument. If FACT = 'F' and +*> EQUED = 'R' or 'B', each element of R must be positive. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (N) +*> The column scale factors for A. If EQUED = 'C' or 'B', A is +*> multiplied on the right by diag(C); if EQUED = 'N' or 'R', C +*> is not accessed. C is an input argument if FACT = 'F'; +*> otherwise, C is an output argument. If FACT = 'F' and +*> EQUED = 'C' or 'B', each element of C must be positive. +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the N-by-NRHS right hand side matrix B. +*> On exit, +*> if EQUED = 'N', B is not modified; +*> if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by +*> diag(R)*B; +*> if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is +*> overwritten by diag(C)*B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X +*> to the original system of equations. Note that A and B are +*> modified on exit if EQUED .ne. 'N', and the solution to the +*> equilibrated system is inv(diag(C))*X if TRANS = 'N' and +*> EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C' +*> and EQUED = 'R' or 'B'. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max(1,N). +*> \endverbatim +*> +*> \param[out] RCOND +*> \verbatim +*> RCOND is DOUBLE PRECISION +*> The estimate of the reciprocal condition number of the matrix +*> A after equilibration (if done). If RCOND is less than the +*> machine precision (in particular, if RCOND = 0), the matrix +*> is singular to working precision. This condition is +*> indicated by a return code of INFO > 0. +*> \endverbatim +*> +*> \param[out] FERR +*> \verbatim +*> FERR is DOUBLE PRECISION array, dimension (NRHS) +*> The estimated forward error bound for each solution vector +*> X(j) (the j-th column of the solution matrix X). +*> If XTRUE is the true solution corresponding to X(j), FERR(j) +*> is an estimated upper bound for the magnitude of the largest +*> element in (X(j) - XTRUE) divided by the magnitude of the +*> largest element in X(j). The estimate is as reliable as +*> the estimate for RCOND, and is almost always a slight +*> overestimate of the true error. +*> \endverbatim +*> +*> \param[out] BERR +*> \verbatim +*> BERR is DOUBLE PRECISION array, dimension (NRHS) +*> The componentwise relative backward error of each solution +*> vector X(j) (i.e., the smallest relative change in +*> any element of A or B that makes X(j) an exact solution). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,2*N)) +*> On exit, RWORK(1) contains the reciprocal pivot growth +*> factor norm(A)/norm(U). The "max absolute element" norm is +*> used. If RWORK(1) is much less than 1, then the stability +*> of the LU factorization of the (equilibrated) matrix A +*> could be poor. This also means that the solution X, condition +*> estimator RCOND, and forward error bound FERR could be +*> unreliable. If factorization fails with 0 RWORK(1) contains the reciprocal pivot growth factor for the +*> leading INFO columns of A. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, and i is +*> <= N: U(i,i) is exactly zero. The factorization has +*> been completed, but the factor U is exactly +*> singular, so the solution and error bounds +*> could not be computed. RCOND = 0 is returned. +*> = N+1: U is nonsingular, but RCOND is less than machine +*> precision, meaning that the matrix is singular +*> to working precision. Nevertheless, the +*> solution and error bounds are computed because +*> there are a number of situations where the +*> computed solution can be more accurate than the +*> value of RCOND would suggest. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsolve +* +* ===================================================================== + SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, + $ EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR, + $ WORK, RWORK, INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER EQUED, FACT, TRANS + INTEGER INFO, LDA, LDAF, LDB, LDX, N, NRHS + DOUBLE PRECISION RCOND +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION BERR( * ), C( * ), FERR( * ), R( * ), + $ RWORK( * ) + COMPLEX*16 A( LDA, * ), AF( LDAF, * ), B( LDB, * ), + $ WORK( * ), X( LDX, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU + CHARACTER NORM + INTEGER I, INFEQU, J + DOUBLE PRECISION AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN, + $ ROWCND, RPVGRW, SMLNUM +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANTR + EXTERNAL LSAME, DLAMCH, ZLANGE, ZLANTR +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGECON, ZGEEQU, ZGERFS, ZGETRF, ZGETRS, + $ ZLACPY, ZLAQGE +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + NOFACT = LSAME( FACT, 'N' ) + EQUIL = LSAME( FACT, 'E' ) + NOTRAN = LSAME( TRANS, 'N' ) + IF( NOFACT .OR. EQUIL ) THEN + EQUED = 'N' + ROWEQU = .FALSE. + COLEQU = .FALSE. + ELSE + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + SMLNUM = DLAMCH( 'Safe minimum' ) + BIGNUM = ONE / SMLNUM + END IF +* +* Test the input parameters. +* + IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) ) + $ THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -6 + ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT. + $ ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN + INFO = -10 + ELSE + IF( ROWEQU ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 10 J = 1, N + RCMIN = MIN( RCMIN, R( J ) ) + RCMAX = MAX( RCMAX, R( J ) ) + 10 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -11 + ELSE IF( N.GT.0 ) THEN + ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + ROWCND = ONE + END IF + END IF + IF( COLEQU .AND. INFO.EQ.0 ) THEN + RCMIN = BIGNUM + RCMAX = ZERO + DO 20 J = 1, N + RCMIN = MIN( RCMIN, C( J ) ) + RCMAX = MAX( RCMAX, C( J ) ) + 20 CONTINUE + IF( RCMIN.LE.ZERO ) THEN + INFO = -12 + ELSE IF( N.GT.0 ) THEN + COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM ) + ELSE + COLCND = ONE + END IF + END IF + IF( INFO.EQ.0 ) THEN + IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -14 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -16 + END IF + END IF + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGESVX', -INFO ) + RETURN + END IF +* + IF( EQUIL ) THEN +* +* Compute row and column scalings to equilibrate the matrix A. +* + CALL ZGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU ) + IF( INFEQU.EQ.0 ) THEN +* +* Equilibrate the matrix. +* + CALL ZLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, + $ EQUED ) + ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' ) + COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' ) + END IF + END IF +* +* Scale the right hand side. +* + IF( NOTRAN ) THEN + IF( ROWEQU ) THEN + DO 40 J = 1, NRHS + DO 30 I = 1, N + B( I, J ) = R( I )*B( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE IF( COLEQU ) THEN + DO 60 J = 1, NRHS + DO 50 I = 1, N + B( I, J ) = C( I )*B( I, J ) + 50 CONTINUE + 60 CONTINUE + END IF +* + IF( NOFACT .OR. EQUIL ) THEN +* +* Compute the LU factorization of A. +* + CALL ZLACPY( 'Full', N, N, A, LDA, AF, LDAF ) + CALL ZGETRF( N, N, AF, LDAF, IPIV, INFO ) +* +* Return if INFO is non-zero. +* + IF( INFO.GT.0 ) THEN +* +* Compute the reciprocal pivot growth factor of the +* leading rank-deficient INFO columns of A. +* + RPVGRW = ZLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF, + $ RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGE( 'M', N, INFO, A, LDA, RWORK ) / + $ RPVGRW + END IF + RWORK( 1 ) = RPVGRW + RCOND = ZERO + RETURN + END IF + END IF +* +* Compute the norm of the matrix A and the +* reciprocal pivot growth factor RPVGRW. +* + IF( NOTRAN ) THEN + NORM = '1' + ELSE + NORM = 'I' + END IF + ANORM = ZLANGE( NORM, N, N, A, LDA, RWORK ) + RPVGRW = ZLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK ) + IF( RPVGRW.EQ.ZERO ) THEN + RPVGRW = ONE + ELSE + RPVGRW = ZLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW + END IF +* +* Compute the reciprocal of the condition number of A. +* + CALL ZGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO ) +* +* Compute the solution matrix X. +* + CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX ) + CALL ZGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO ) +* +* Use iterative refinement to improve the computed solution and +* compute error bounds and backward error estimates for it. +* + CALL ZGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X, + $ LDX, FERR, BERR, WORK, RWORK, INFO ) +* +* Transform the solution matrix X to a solution of the original +* system. +* + IF( NOTRAN ) THEN + IF( COLEQU ) THEN + DO 80 J = 1, NRHS + DO 70 I = 1, N + X( I, J ) = C( I )*X( I, J ) + 70 CONTINUE + 80 CONTINUE + DO 90 J = 1, NRHS + FERR( J ) = FERR( J ) / COLCND + 90 CONTINUE + END IF + ELSE IF( ROWEQU ) THEN + DO 110 J = 1, NRHS + DO 100 I = 1, N + X( I, J ) = R( I )*X( I, J ) + 100 CONTINUE + 110 CONTINUE + DO 120 J = 1, NRHS + FERR( J ) = FERR( J ) / ROWCND + 120 CONTINUE + END IF +* +* Set INFO = N+1 if the matrix is singular to working precision. +* + IF( RCOND.LT.DLAMCH( 'Epsilon' ) ) + $ INFO = N + 1 +* + RWORK( 1 ) = RPVGRW + RETURN +* +* End of ZGESVX +* + END From 02efa8d6be38db3810d955ebe0b11769eb338e28 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 May 2023 20:01:47 +0200 Subject: [PATCH 072/718] Fix typos in comments (Reference-LAPACK 811) --- lapack-netlib/SRC/cgejsv.f | 18 +++++++++--------- lapack-netlib/SRC/claswlq.f | 1 + lapack-netlib/SRC/clatsqr.f | 1 + lapack-netlib/SRC/cuncsd2by1.f | 4 ++-- lapack-netlib/SRC/cungtsqr.f | 3 ++- lapack-netlib/SRC/dlaswlq.f | 1 + lapack-netlib/SRC/dlatsqr.f | 1 + lapack-netlib/SRC/dorgtsqr.f | 3 ++- lapack-netlib/SRC/slaswlq.f | 1 + lapack-netlib/SRC/slatsqr.f | 1 + lapack-netlib/SRC/sorgtsqr.f | 3 ++- lapack-netlib/SRC/zgejsv.f | 18 +++++++++--------- lapack-netlib/SRC/zlaswlq.f | 1 + lapack-netlib/SRC/zlatsqr.f | 1 + lapack-netlib/SRC/zuncsd2by1.f | 4 ++-- lapack-netlib/SRC/zungtsqr.f | 3 ++- 16 files changed, 38 insertions(+), 26 deletions(-) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 062ac182b..8fe4159f6 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is REAL array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. *> - If JOBR='R' then some of the singular values may be returned diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 63cbd02c6..1a09b8305 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index 7a0b85487..377190081 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f index 8360d5932..f0c44f670 100644 --- a/lapack-netlib/SRC/cuncsd2by1.f +++ b/lapack-netlib/SRC/cuncsd2by1.f @@ -212,13 +212,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f index 64ccb9731..1734be2aa 100644 --- a/lapack-netlib/SRC/cungtsqr.f +++ b/lapack-netlib/SRC/cungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of CUNGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index fb8857145..c95c94cbc 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index f5cbb76bb..94a04be02 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f index 34d96b238..2e5cd40be 100644 --- a/lapack-netlib/SRC/dorgtsqr.f +++ b/lapack-netlib/SRC/dorgtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -301,4 +302,4 @@ * * End of DORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index bc7cf343d..95e0ddcce 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB * M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 33966c01f..86733bb15 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f index a755fa4a4..692eba1d9 100644 --- a/lapack-netlib/SRC/sorgtsqr.f +++ b/lapack-netlib/SRC/sorgtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -301,4 +302,4 @@ * * End of SORGTSQR * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 5134ea891..464c4a0fa 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -52,10 +52,10 @@ *> are computed and stored in the arrays U and V, respectively. The diagonal *> of [SIGMA] is computed and stored in the array SVA. *> \endverbatim -*> -*> Arguments: -*> ========== -*> +* +* Arguments: +* ========== +* *> \param[in] JOBA *> \verbatim *> JOBA is CHARACTER*1 @@ -151,7 +151,7 @@ *> transposed A if A^* seems to be better with respect to convergence. *> If the matrix is not square, JOBT is ignored. *> The decision is based on two values of entropy over the adjoint -*> orbit of A^* * A. See the descriptions of WORK(6) and WORK(7). +*> orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7). *> = 'T': transpose if entropy test indicates possibly faster *> convergence of Jacobi process if A^* is taken as input. If A is *> replaced with A^*, then the row pivoting is included automatically. @@ -209,11 +209,11 @@ *> \verbatim *> SVA is DOUBLE PRECISION array, dimension (N) *> On exit, -*> - For WORK(1)/WORK(2) = ONE: The singular values of A. During the -*> computation SVA contains Euclidean column norms of the +*> - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During +*> the computation SVA contains Euclidean column norms of the *> iterated matrices in the array A. -*> - For WORK(1) .NE. WORK(2): The singular values of A are -*> (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if +*> - For RWORK(1) .NE. RWORK(2): The singular values of A are +*> (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if *> sigma_max(A) overflows or if small singular values have been *> saved from underflow by scaling the input matrix A. *> - If JOBR='R' then some of the singular values may be returned diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index e4e703343..be4c48539 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -104,6 +104,7 @@ *> \endverbatim *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= MB*M. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index ffdbc68c0..8c938aebc 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -106,6 +106,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= NB*N. *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/zuncsd2by1.f b/lapack-netlib/SRC/zuncsd2by1.f index 2d861c2cc..399b598be 100644 --- a/lapack-netlib/SRC/zuncsd2by1.f +++ b/lapack-netlib/SRC/zuncsd2by1.f @@ -211,13 +211,13 @@ *> LRWORK is INTEGER *> The dimension of the array RWORK. *> -*> If LRWORK = -1, then a workspace query is assumed; the routine +*> If LRWORK=-1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK and RWORK *> arrays, returns this value as the first entry of the WORK *> and RWORK array, respectively, and no error message related *> to LWORK or LRWORK is issued by XERBLA. *> \endverbatim -* +*> *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q)) diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f index 4f2d7dfdd..23e28ac5c 100644 --- a/lapack-netlib/SRC/zungtsqr.f +++ b/lapack-netlib/SRC/zungtsqr.f @@ -133,6 +133,7 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= (M+NB)*N. *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK @@ -302,4 +303,4 @@ * * End of ZUNGTSQR * - END \ No newline at end of file + END From 5fbd5f531bcd074603654948dca4ec771a25a833 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 May 2023 20:05:05 +0200 Subject: [PATCH 073/718] Fix typo in description of VR argument (Reference-LAPACK 812) --- lapack-netlib/SRC/ctgevc.f | 7 +++---- lapack-netlib/SRC/ztgevc.f | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/ctgevc.f b/lapack-netlib/SRC/ctgevc.f index 4e5289cb2..22144f259 100644 --- a/lapack-netlib/SRC/ctgevc.f +++ b/lapack-netlib/SRC/ctgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by CHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, SLAMCH, CLADIV * .. * .. External Subroutines .. - EXTERNAL CGEMV, SLABAD, XERBLA + EXTERNAL CGEMV, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -367,7 +367,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/ztgevc.f b/lapack-netlib/SRC/ztgevc.f index 23bd36ddb..6dcbe582f 100644 --- a/lapack-netlib/SRC/ztgevc.f +++ b/lapack-netlib/SRC/ztgevc.f @@ -53,7 +53,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal elements of S and P. *> @@ -154,7 +154,7 @@ *> \verbatim *> VR is COMPLEX*16 array, dimension (LDVR,MM) *> On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must -*> contain an N-by-N matrix Q (usually the unitary matrix Z +*> contain an N-by-N matrix Z (usually the unitary matrix Z *> of right Schur vectors returned by ZHGEQZ). *> On exit, if SIDE = 'R' or 'B', VR contains: *> if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P); @@ -259,7 +259,7 @@ EXTERNAL LSAME, DLAMCH, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEMV + EXTERNAL XERBLA, ZGEMV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -367,7 +367,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL From a82c1443dbf40e4c46ed3158c6945a3cdb28d3cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 May 2023 14:33:46 +0200 Subject: [PATCH 074/718] Fix typos in comments (Reference-LAPACK PR 814) --- lapack-netlib/SRC/cgejsv.f | 6 +++--- lapack-netlib/SRC/cgsvj0.f | 6 +++--- lapack-netlib/SRC/cgsvj1.f | 6 +++--- lapack-netlib/SRC/clalsa.f | 4 ++-- lapack-netlib/SRC/cstegr.f | 2 +- lapack-netlib/SRC/ctgsen.f | 2 +- lapack-netlib/SRC/dgejsv.f | 8 ++++---- lapack-netlib/SRC/dgsvj0.f | 6 +++--- lapack-netlib/SRC/dgsvj1.f | 6 +++--- lapack-netlib/SRC/dlalsa.f | 4 ++-- lapack-netlib/SRC/dlarre.f | 2 +- lapack-netlib/SRC/dstegr.f | 2 +- lapack-netlib/SRC/dtgevc.f | 5 ++--- lapack-netlib/SRC/iparam2stage.F | 4 ++-- lapack-netlib/SRC/sgejsv.f | 6 +++--- lapack-netlib/SRC/sgsvj0.f | 6 +++--- lapack-netlib/SRC/sgsvj1.f | 6 +++--- lapack-netlib/SRC/slalsa.f | 4 ++-- lapack-netlib/SRC/slarre.f | 2 +- lapack-netlib/SRC/sstegr.f | 2 +- lapack-netlib/SRC/stgevc.f | 5 ++--- lapack-netlib/SRC/zgejsv.f | 6 +++--- lapack-netlib/SRC/zgsvj0.f | 6 +++--- lapack-netlib/SRC/zgsvj1.f | 6 +++--- lapack-netlib/SRC/zlalsa.f | 4 ++-- lapack-netlib/SRC/zstegr.f | 2 +- lapack-netlib/SRC/ztgsy2.f | 2 +- 27 files changed, 59 insertions(+), 61 deletions(-) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 8fe4159f6..2b7b2dc70 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -1819,7 +1819,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1842,7 +1842,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1886,7 +1886,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f index 37853a10a..9e6053013 100644 --- a/lapack-netlib/SRC/cgsvj0.f +++ b/lapack-netlib/SRC/cgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f index f1a5204e3..6d68e920d 100644 --- a/lapack-netlib/SRC/cgsvj1.f +++ b/lapack-netlib/SRC/cgsvj1.f @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/clalsa.f b/lapack-netlib/SRC/clalsa.f index 06883ab20..fceb380f5 100644 --- a/lapack-netlib/SRC/clalsa.f +++ b/lapack-netlib/SRC/clalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> CLALSA is an itermediate step in solving the least squares problem +*> CLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/cstegr.f b/lapack-netlib/SRC/cstegr.f index 9d6e06da3..a162d5076 100644 --- a/lapack-netlib/SRC/cstegr.f +++ b/lapack-netlib/SRC/cstegr.f @@ -56,7 +56,7 @@ *> *> Note : CSTEGR and CSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f index f9b6cd10c..ffd638099 100644 --- a/lapack-netlib/SRC/ctgsen.f +++ b/lapack-netlib/SRC/ctgsen.f @@ -339,7 +339,7 @@ *> [ kron(In2, B11) -kron(B22**H, In1) ]. *> *> Here, Inx is the identity matrix of size nx and A22**H is the -*> conjuguate transpose of A22. kron(X, Y) is the Kronecker product between +*> conjugate transpose of A22. kron(X, Y) is the Kronecker product between *> the matrices X and Y. *> *> When DIF(2) is small, small changes in (A, B) can cause large changes diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index 83d16c30e..798e9154d 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -362,7 +362,7 @@ *> *> \param[out] IWORK *> \verbatim -*> IWORK is INTEGER array, dimension (M+3*N). +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). *> On exit, *> IWORK(1) = the numerical rank determined after the initial *> QR factorization with pivoting. See the descriptions @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f index 197a9a626..586723e64 100644 --- a/lapack-netlib/SRC/dgsvj0.f +++ b/lapack-netlib/SRC/dgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f index 11d1dde2b..d757bb927 100644 --- a/lapack-netlib/SRC/dgsvj1.f +++ b/lapack-netlib/SRC/dgsvj1.f @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is DOUBLE PRECISION array, dimension (LDV,N) -*> If JOBV = 'V', then N rows of V are post-multipled by a +*> If JOBV = 'V', then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/dlalsa.f b/lapack-netlib/SRC/dlalsa.f index da8e0fa17..d89f5d2f9 100644 --- a/lapack-netlib/SRC/dlalsa.f +++ b/lapack-netlib/SRC/dlalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> DLALSA is an itermediate step in solving the least squares problem +*> DLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f index 70f59b829..1cceed1a2 100644 --- a/lapack-netlib/SRC/dlarre.f +++ b/lapack-netlib/SRC/dlarre.f @@ -51,7 +51,7 @@ *> DSTEMR to compute the eigenvectors of T. *> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, DLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/dstegr.f b/lapack-netlib/SRC/dstegr.f index 01ec101d8..598c60e8d 100644 --- a/lapack-netlib/SRC/dstegr.f +++ b/lapack-netlib/SRC/dstegr.f @@ -56,7 +56,7 @@ *> *> Note : DSTEGR and DSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/dtgevc.f b/lapack-netlib/SRC/dtgevc.f index e7084664c..be70b2083 100644 --- a/lapack-netlib/SRC/dtgevc.f +++ b/lapack-netlib/SRC/dtgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, DLAMCH * .. * .. External Subroutines .. - EXTERNAL DGEMV, DLABAD, DLACPY, DLAG2, DLALN2, XERBLA + EXTERNAL DGEMV, DLACPY, DLAG2, DLALN2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = DLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL DLABAD( SAFMIN, BIG ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c701c2be0..52d507e9d 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -89,14 +89,14 @@ *> *> \param[in] NBI *> \verbatim -*> NBI is INTEGER which is the used in the reduciton, +*> NBI is INTEGER which is the used in the reduction, *> (e.g., the size of the band), needed to compute workspace *> and LHOUS2. *> \endverbatim *> *> \param[in] IBI *> \verbatim -*> IBI is INTEGER which represent the IB of the reduciton, +*> IBI is INTEGER which represent the IB of the reduction, *> needed to compute workspace and LHOUS2. *> \endverbatim *> diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index 923573bdb..1333e54fb 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -1386,7 +1386,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1409,7 +1409,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1454,7 +1454,7 @@ * :) .. the input matrix A is very likely a relative of * the Kahan matrix :) * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^T*V3 = the product of the Jacobi rotations (appplied to +* is Q3^T*V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL SGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f index 23b6f0077..9249f4219 100644 --- a/lapack-netlib/SRC/sgsvj0.f +++ b/lapack-netlib/SRC/sgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f index 0345ccb42..79fd4d2d3 100644 --- a/lapack-netlib/SRC/sgsvj1.f +++ b/lapack-netlib/SRC/sgsvj1.f @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is REAL array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/slalsa.f b/lapack-netlib/SRC/slalsa.f index 465455e4a..95becc76e 100644 --- a/lapack-netlib/SRC/slalsa.f +++ b/lapack-netlib/SRC/slalsa.f @@ -43,9 +43,9 @@ *> *> \verbatim *> -*> SLALSA is an itermediate step in solving the least squares problem +*> SLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f index 2e34ca5a6..e1b52c385 100644 --- a/lapack-netlib/SRC/slarre.f +++ b/lapack-netlib/SRC/slarre.f @@ -51,7 +51,7 @@ *> SSTEMR to compute the eigenvectors of T. *> The accuracy varies depending on whether bisection is used to *> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to -*> conpute all and then discard any unwanted one. +*> compute all and then discard any unwanted one. *> As an added benefit, SLARRE also outputs the n *> Gerschgorin intervals for the matrices L_i D_i L_i^T. *> \endverbatim diff --git a/lapack-netlib/SRC/sstegr.f b/lapack-netlib/SRC/sstegr.f index 2e2975fdf..2967a6969 100644 --- a/lapack-netlib/SRC/sstegr.f +++ b/lapack-netlib/SRC/sstegr.f @@ -56,7 +56,7 @@ *> *> Note : SSTEGR and SSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/stgevc.f b/lapack-netlib/SRC/stgevc.f index 15fc88c4b..be4cb1829 100644 --- a/lapack-netlib/SRC/stgevc.f +++ b/lapack-netlib/SRC/stgevc.f @@ -52,7 +52,7 @@ *> *> S*x = w*P*x, (y**H)*S = w*(y**H)*P, *> -*> where y**H denotes the conjugate tranpose of y. +*> where y**H denotes the conjugate transpose of y. *> The eigenvalues are not input to this routine, but are computed *> directly from the diagonal blocks of S and P. *> @@ -337,7 +337,7 @@ EXTERNAL LSAME, SLAMCH * .. * .. External Subroutines .. - EXTERNAL SGEMV, SLABAD, SLACPY, SLAG2, SLALN2, XERBLA + EXTERNAL SGEMV, SLACPY, SLAG2, SLALN2, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -463,7 +463,6 @@ * SAFMIN = SLAMCH( 'Safe minimum' ) BIG = ONE / SAFMIN - CALL SLABAD( SAFMIN, BIG ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) SMALL = SAFMIN*N / ULP BIG = ONE / SMALL diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 464c4a0fa..a36a9188a 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -1821,7 +1821,7 @@ IF ( CONDR2 .GE. COND_OK ) THEN * .. save the Householder vectors used for Q3 * (this overwrites the copy of R2, as it will not be -* needed in this branch, but it does not overwritte the +* needed in this branch, but it does not overwrite the * Huseholder vectors of Q2.). CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N ) * .. and the rest of the information on Q3 is in @@ -1844,7 +1844,7 @@ END IF * * Second preconditioning finished; continue with Jacobi SVD -* The input matrix is lower trinagular. +* The input matrix is lower triangular. * * Recover the right singular vectors as solution of a well * conditioned triangular matrix equation. @@ -1888,7 +1888,7 @@ ELSE IF ( CONDR2 .LT. COND_OK ) THEN * * The matrix R2 is inverted. The solution of the matrix equation -* is Q3^* * V3 = the product of the Jacobi rotations (appplied to +* is Q3^* * V3 = the product of the Jacobi rotations (applied to * the lower triangular L3 from the LQ factorization of * R2=L3*Q3), pre-multiplied with the transposed Q3. CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U, diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f index 672ab7590..11bfbd8dc 100644 --- a/lapack-netlib/SRC/zgsvj0.f +++ b/lapack-netlib/SRC/zgsvj0.f @@ -117,7 +117,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -125,9 +125,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f index 2461a9555..efe0384ef 100644 --- a/lapack-netlib/SRC/zgsvj1.f +++ b/lapack-netlib/SRC/zgsvj1.f @@ -147,7 +147,7 @@ *> \param[in] MV *> \verbatim *> MV is INTEGER -*> If JOBV = 'A', then MV rows of V are post-multipled by a +*> If JOBV = 'A', then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then MV is not referenced. *> \endverbatim @@ -155,9 +155,9 @@ *> \param[in,out] V *> \verbatim *> V is COMPLEX*16 array, dimension (LDV,N) -*> If JOBV = 'V' then N rows of V are post-multipled by a +*> If JOBV = 'V' then N rows of V are post-multiplied by a *> sequence of Jacobi rotations. -*> If JOBV = 'A' then MV rows of V are post-multipled by a +*> If JOBV = 'A' then MV rows of V are post-multiplied by a *> sequence of Jacobi rotations. *> If JOBV = 'N', then V is not referenced. *> \endverbatim diff --git a/lapack-netlib/SRC/zlalsa.f b/lapack-netlib/SRC/zlalsa.f index 73ccf3a8b..d419598d2 100644 --- a/lapack-netlib/SRC/zlalsa.f +++ b/lapack-netlib/SRC/zlalsa.f @@ -42,9 +42,9 @@ *> *> \verbatim *> -*> ZLALSA is an itermediate step in solving the least squares problem +*> ZLALSA is an intermediate step in solving the least squares problem *> by computing the SVD of the coefficient matrix in compact form (The -*> singular vectors are computed as products of simple orthorgonal +*> singular vectors are computed as products of simple orthogonal *> matrices.). *> *> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector diff --git a/lapack-netlib/SRC/zstegr.f b/lapack-netlib/SRC/zstegr.f index ee43a0d21..3736a0517 100644 --- a/lapack-netlib/SRC/zstegr.f +++ b/lapack-netlib/SRC/zstegr.f @@ -56,7 +56,7 @@ *> *> Note : ZSTEGR and ZSTEMR work only on machines which follow *> IEEE-754 floating-point standard in their handling of infinities and -*> NaNs. Normal execution may create these exceptiona values and hence +*> NaNs. Normal execution may create these exceptional values and hence *> may abort due to a floating point exception in environments which *> do not conform to the IEEE-754 standard. *> \endverbatim diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f index ee26b5e7b..0cae8939e 100644 --- a/lapack-netlib/SRC/ztgsy2.f +++ b/lapack-netlib/SRC/ztgsy2.f @@ -57,7 +57,7 @@ *> Z = [ kron(In, A) -kron(B**H, Im) ] (2) *> [ kron(In, D) -kron(E**H, Im) ], *> -*> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X. +*> Ik is the identity matrix of size k and X**H is the conjugate transpose of X. *> kron(X, Y) is the Kronecker product between the matrices X and Y. *> *> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b From b01894adcb411abc976869e4a6b5bfb3bd697b69 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 May 2023 14:35:13 +0200 Subject: [PATCH 075/718] Fix typos in comments (Reference-LAPACK PR 814) --- lapack-netlib/TESTING/EIG/cdrgvx.f | 2 +- lapack-netlib/TESTING/EIG/ddrgvx.f | 2 +- lapack-netlib/TESTING/EIG/sdrgvx.f | 2 +- lapack-netlib/TESTING/EIG/zdrgvx.f | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cdrgvx.f b/lapack-netlib/TESTING/EIG/cdrgvx.f index 830a39d86..4114b697f 100644 --- a/lapack-netlib/TESTING/EIG/cdrgvx.f +++ b/lapack-netlib/TESTING/EIG/cdrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/ddrgvx.f b/lapack-netlib/TESTING/EIG/ddrgvx.f index c63762134..3de72eb70 100644 --- a/lapack-netlib/TESTING/EIG/ddrgvx.f +++ b/lapack-netlib/TESTING/EIG/ddrgvx.f @@ -56,7 +56,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/sdrgvx.f b/lapack-netlib/TESTING/EIG/sdrgvx.f index f6da720ad..91803f60a 100644 --- a/lapack-netlib/TESTING/EIG/sdrgvx.f +++ b/lapack-netlib/TESTING/EIG/sdrgvx.f @@ -57,7 +57,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> diff --git a/lapack-netlib/TESTING/EIG/zdrgvx.f b/lapack-netlib/TESTING/EIG/zdrgvx.f index 72e0303fb..813151f13 100644 --- a/lapack-netlib/TESTING/EIG/zdrgvx.f +++ b/lapack-netlib/TESTING/EIG/zdrgvx.f @@ -55,7 +55,7 @@ *> *> | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) ) *> -*> where l**H is the conjugate tranpose of l. +*> where l**H is the conjugate transpose of l. *> *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of *> From 47715b57268cf3f32ad37b90e65cd5932fdb5620 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 May 2023 14:36:21 +0200 Subject: [PATCH 076/718] Fix typos in comments (Reference-LAPACK PR 814) --- lapack-netlib/TESTING/LIN/crzt01.f | 2 +- lapack-netlib/TESTING/LIN/drzt01.f | 2 +- lapack-netlib/TESTING/LIN/srzt01.f | 2 +- lapack-netlib/TESTING/LIN/zrzt01.f | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/crzt01.f b/lapack-netlib/TESTING/LIN/crzt01.f index 6a75a8b31..94fdcb4a9 100644 --- a/lapack-netlib/TESTING/LIN/crzt01.f +++ b/lapack-netlib/TESTING/LIN/crzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL CUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL CUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/drzt01.f b/lapack-netlib/TESTING/LIN/drzt01.f index 8e969aba7..7a88a2c20 100644 --- a/lapack-netlib/TESTING/LIN/drzt01.f +++ b/lapack-netlib/TESTING/LIN/drzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL DORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL DORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/srzt01.f b/lapack-netlib/TESTING/LIN/srzt01.f index 4478c0506..a2b8b29ba 100644 --- a/lapack-netlib/TESTING/LIN/srzt01.f +++ b/lapack-netlib/TESTING/LIN/srzt01.f @@ -158,7 +158,7 @@ * * R = R * P(1) * ... *P(m) * - CALL SORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL SORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A diff --git a/lapack-netlib/TESTING/LIN/zrzt01.f b/lapack-netlib/TESTING/LIN/zrzt01.f index 2eba7ba6f..b6e4d73f9 100644 --- a/lapack-netlib/TESTING/LIN/zrzt01.f +++ b/lapack-netlib/TESTING/LIN/zrzt01.f @@ -159,7 +159,7 @@ * * R = R * P(1) * ... *P(m) * - CALL ZUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU, + CALL ZUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU, $ WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO ) * * R = R - A From be05ba4374b40310cef2c874e797e76772119f4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 16:28:20 +0200 Subject: [PATCH 077/718] Fix typos in comments and documentation of LAPACK (Reference-LAPACK PR 820) (#4045) * Fix typos in comments and documentation (Reference-LAPACK PR 820) --- .../CMAKE/CheckFortranTypeSizes.cmake | 2 +- lapack-netlib/CMAKE/Findcodecov.cmake | 4 +- lapack-netlib/CMakeLists.txt | 300 ++++++++++++++++-- lapack-netlib/DOCS/lawn81.tex | 2 +- .../LAPACKE/utils/lapacke_cgb_trans.c | 2 +- .../LAPACKE/utils/lapacke_dgb_trans.c | 2 +- .../LAPACKE/utils/lapacke_sgb_trans.c | 2 +- .../LAPACKE/utils/lapacke_zgb_trans.c | 2 +- lapack-netlib/SRC/DEPRECATED/cggsvd.f | 2 +- lapack-netlib/SRC/DEPRECATED/zggsvd.f | 2 +- lapack-netlib/SRC/cgejsv.f | 2 +- lapack-netlib/SRC/cgesvdq.f | 2 +- lapack-netlib/SRC/cgsvj0.f | 4 +- lapack-netlib/SRC/cgsvj1.f | 4 +- lapack-netlib/SRC/chesv_aa_2stage.f | 2 +- lapack-netlib/SRC/chetf2_rk.f | 8 +- lapack-netlib/SRC/chetf2_rook.f | 8 +- lapack-netlib/SRC/chetrf_aa.f | 2 +- lapack-netlib/SRC/chetrf_aa_2stage.f | 2 +- lapack-netlib/SRC/cla_gbrfsx_extended.f | 6 +- lapack-netlib/SRC/cla_gerfsx_extended.f | 2 +- lapack-netlib/SRC/cla_herfsx_extended.f | 2 +- lapack-netlib/SRC/cla_porfsx_extended.f | 2 +- lapack-netlib/SRC/cla_syrfsx_extended.f | 2 +- lapack-netlib/SRC/claed7.f | 2 +- lapack-netlib/SRC/claqz0.f | 7 +- lapack-netlib/SRC/clarfb_gett.f | 2 +- lapack-netlib/SRC/clatdf.f | 2 +- lapack-netlib/SRC/clatrs3.f | 2 +- lapack-netlib/SRC/csyconvf.f | 10 +- lapack-netlib/SRC/csyconvf_rook.f | 2 +- lapack-netlib/SRC/csysv_aa_2stage.f | 2 +- lapack-netlib/SRC/csytrf_aa.f | 2 +- lapack-netlib/SRC/csytrf_aa_2stage.f | 2 +- lapack-netlib/SRC/ctrexc.f | 2 +- lapack-netlib/SRC/cunbdb1.f | 2 +- lapack-netlib/SRC/cunbdb2.f | 2 +- lapack-netlib/SRC/cunbdb3.f | 2 +- lapack-netlib/SRC/cunbdb4.f | 2 +- lapack-netlib/SRC/dbdsvdx.f | 2 +- lapack-netlib/SRC/dgejsv.f | 2 +- lapack-netlib/SRC/dgesvdq.f | 2 +- lapack-netlib/SRC/dgsvj0.f | 4 +- lapack-netlib/SRC/dgsvj1.f | 4 +- lapack-netlib/SRC/dla_gbrfsx_extended.f | 2 +- lapack-netlib/SRC/dla_gerfsx_extended.f | 2 +- lapack-netlib/SRC/dla_porfsx_extended.f | 2 +- lapack-netlib/SRC/dla_syrfsx_extended.f | 2 +- lapack-netlib/SRC/dlaqz0.f | 7 +- lapack-netlib/SRC/dlarfb_gett.f | 2 +- lapack-netlib/SRC/dlatrs3.f | 2 +- lapack-netlib/SRC/dorbdb1.f | 2 +- lapack-netlib/SRC/dorbdb2.f | 2 +- lapack-netlib/SRC/dorbdb3.f | 2 +- lapack-netlib/SRC/dorbdb4.f | 2 +- lapack-netlib/SRC/dsyconvf.f | 10 +- lapack-netlib/SRC/dsyconvf_rook.f | 2 +- lapack-netlib/SRC/dsysv_aa_2stage.f | 2 +- lapack-netlib/SRC/dsytrf_aa.f | 2 +- lapack-netlib/SRC/dsytrf_aa_2stage.f | 2 +- lapack-netlib/SRC/sbdsvdx.f | 2 +- lapack-netlib/SRC/sgejsv.f | 2 +- lapack-netlib/SRC/sgesvdq.f | 2 +- lapack-netlib/SRC/sgsvj0.f | 4 +- lapack-netlib/SRC/sgsvj1.f | 4 +- lapack-netlib/SRC/sla_gbrfsx_extended.f | 2 +- lapack-netlib/SRC/sla_gerfsx_extended.f | 2 +- lapack-netlib/SRC/sla_porfsx_extended.f | 2 +- lapack-netlib/SRC/sla_syrfsx_extended.f | 2 +- lapack-netlib/SRC/slaqz0.f | 7 +- lapack-netlib/SRC/slarfb_gett.f | 2 +- lapack-netlib/SRC/slatrs3.f | 2 +- lapack-netlib/SRC/sorbdb1.f | 2 +- lapack-netlib/SRC/sorbdb2.f | 2 +- lapack-netlib/SRC/sorbdb3.f | 2 +- lapack-netlib/SRC/sorbdb4.f | 2 +- lapack-netlib/SRC/ssyconvf.f | 10 +- lapack-netlib/SRC/ssyconvf_rook.f | 2 +- lapack-netlib/SRC/ssysv_aa_2stage.f | 2 +- lapack-netlib/SRC/ssytrf_aa.f | 2 +- lapack-netlib/SRC/ssytrf_aa_2stage.f | 2 +- lapack-netlib/SRC/zgejsv.f | 2 +- lapack-netlib/SRC/zgesvdq.f | 2 +- lapack-netlib/SRC/zgsvj0.f | 4 +- lapack-netlib/SRC/zgsvj1.f | 4 +- lapack-netlib/SRC/zhesv_aa_2stage.f | 2 +- lapack-netlib/SRC/zhetf2_rk.f | 8 +- lapack-netlib/SRC/zhetf2_rook.f | 8 +- lapack-netlib/SRC/zhetrf_aa.f | 2 +- lapack-netlib/SRC/zhetrf_aa_2stage.f | 2 +- lapack-netlib/SRC/zla_gbrfsx_extended.f | 2 +- lapack-netlib/SRC/zla_gerfsx_extended.f | 2 +- lapack-netlib/SRC/zla_herfsx_extended.f | 2 +- lapack-netlib/SRC/zla_porfsx_extended.f | 2 +- lapack-netlib/SRC/zla_syrfsx_extended.f | 2 +- lapack-netlib/SRC/zlaed7.f | 2 +- lapack-netlib/SRC/zlaqz0.f | 7 +- lapack-netlib/SRC/zlarfb_gett.f | 2 +- lapack-netlib/SRC/zlatdf.f | 2 +- lapack-netlib/SRC/zlatrs3.f | 2 +- lapack-netlib/SRC/zsyconvf.f | 10 +- lapack-netlib/SRC/zsyconvf_rook.f | 2 +- lapack-netlib/SRC/zsysv_aa_2stage.f | 2 +- lapack-netlib/SRC/zsytrf_aa.f | 2 +- lapack-netlib/SRC/zsytrf_aa_2stage.f | 2 +- lapack-netlib/SRC/ztrexc.f | 2 +- lapack-netlib/SRC/zunbdb1.f | 2 +- lapack-netlib/SRC/zunbdb2.f | 2 +- lapack-netlib/SRC/zunbdb3.f | 2 +- lapack-netlib/SRC/zunbdb4.f | 2 +- lapack-netlib/TESTING/EIG/cchkst.f | 6 +- lapack-netlib/TESTING/EIG/cchkst2stg.f | 11 +- lapack-netlib/TESTING/EIG/cdrgsx.f | 5 +- lapack-netlib/TESTING/EIG/cdrvsg.f | 5 +- lapack-netlib/TESTING/EIG/cdrvsg2stg.f | 5 +- lapack-netlib/TESTING/EIG/cdrvst.f | 7 +- lapack-netlib/TESTING/EIG/cdrvst2stg.f | 5 +- lapack-netlib/TESTING/EIG/cerrst.f | 74 ++--- lapack-netlib/TESTING/EIG/dchkst.f | 11 +- lapack-netlib/TESTING/EIG/dchkst2stg.f | 11 +- lapack-netlib/TESTING/EIG/ddrgsx.f | 5 +- lapack-netlib/TESTING/EIG/ddrvsg.f | 5 +- lapack-netlib/TESTING/EIG/ddrvsg2stg.f | 5 +- lapack-netlib/TESTING/EIG/ddrvst.f | 13 +- lapack-netlib/TESTING/EIG/ddrvst2stg.f | 5 +- lapack-netlib/TESTING/EIG/schkst.f | 11 +- lapack-netlib/TESTING/EIG/schkst2stg.f | 11 +- lapack-netlib/TESTING/EIG/sdrgsx.f | 5 +- lapack-netlib/TESTING/EIG/sdrvsg.f | 5 +- lapack-netlib/TESTING/EIG/sdrvsg2stg.f | 5 +- lapack-netlib/TESTING/EIG/sdrvst.f | 13 +- lapack-netlib/TESTING/EIG/sdrvst2stg.f | 5 +- lapack-netlib/TESTING/EIG/zchkst.f | 12 +- lapack-netlib/TESTING/EIG/zchkst2stg.f | 13 +- lapack-netlib/TESTING/EIG/zdrgsx.f | 5 +- lapack-netlib/TESTING/EIG/zdrvsg.f | 5 +- lapack-netlib/TESTING/EIG/zdrvsg2stg.f | 5 +- lapack-netlib/TESTING/EIG/zdrvst.f | 11 +- lapack-netlib/TESTING/EIG/zdrvst2stg.f | 5 +- lapack-netlib/TESTING/LIN/alahd.f | 2 +- lapack-netlib/TESTING/LIN/cchktp.f | 2 +- lapack-netlib/TESTING/LIN/cerrhe.f | 4 +- lapack-netlib/TESTING/LIN/cerrhex.f | 4 +- lapack-netlib/TESTING/LIN/cerrsy.f | 4 +- lapack-netlib/TESTING/LIN/cerrsyx.f | 4 +- lapack-netlib/TESTING/LIN/cgtt01.f | 2 +- lapack-netlib/TESTING/LIN/cgtt02.f | 4 +- lapack-netlib/TESTING/LIN/chet01_3.f | 2 +- lapack-netlib/TESTING/LIN/clqt02.f | 2 +- lapack-netlib/TESTING/LIN/cptt01.f | 2 +- lapack-netlib/TESTING/LIN/cptt02.f | 2 +- lapack-netlib/TESTING/LIN/cqlt02.f | 2 +- lapack-netlib/TESTING/LIN/cqrt02.f | 2 +- lapack-netlib/TESTING/LIN/crqt02.f | 2 +- lapack-netlib/TESTING/LIN/csyt01_3.f | 2 +- lapack-netlib/TESTING/LIN/dchktp.f | 2 +- lapack-netlib/TESTING/LIN/ddrvab.f | 2 +- lapack-netlib/TESTING/LIN/ddrvac.f | 2 +- lapack-netlib/TESTING/LIN/derrsy.f | 4 +- lapack-netlib/TESTING/LIN/derrsyx.f | 4 +- lapack-netlib/TESTING/LIN/dgtt01.f | 2 +- lapack-netlib/TESTING/LIN/dgtt02.f | 4 +- lapack-netlib/TESTING/LIN/dlqt02.f | 2 +- lapack-netlib/TESTING/LIN/dptt01.f | 2 +- lapack-netlib/TESTING/LIN/dptt02.f | 2 +- lapack-netlib/TESTING/LIN/dqlt02.f | 2 +- lapack-netlib/TESTING/LIN/dqrt02.f | 2 +- lapack-netlib/TESTING/LIN/drqt02.f | 2 +- lapack-netlib/TESTING/LIN/dsyt01_3.f | 2 +- lapack-netlib/TESTING/LIN/schktp.f | 2 +- lapack-netlib/TESTING/LIN/serrsy.f | 4 +- lapack-netlib/TESTING/LIN/serrsyx.f | 4 +- lapack-netlib/TESTING/LIN/sgtt01.f | 2 +- lapack-netlib/TESTING/LIN/sgtt02.f | 4 +- lapack-netlib/TESTING/LIN/slqt02.f | 2 +- lapack-netlib/TESTING/LIN/sptt01.f | 2 +- lapack-netlib/TESTING/LIN/sptt02.f | 2 +- lapack-netlib/TESTING/LIN/sqlt02.f | 2 +- lapack-netlib/TESTING/LIN/sqrt02.f | 2 +- lapack-netlib/TESTING/LIN/srqt02.f | 2 +- lapack-netlib/TESTING/LIN/ssyt01_3.f | 2 +- lapack-netlib/TESTING/LIN/zchktp.f | 2 +- lapack-netlib/TESTING/LIN/zdrvab.f | 2 +- lapack-netlib/TESTING/LIN/zdrvac.f | 2 +- lapack-netlib/TESTING/LIN/zerrhe.f | 4 +- lapack-netlib/TESTING/LIN/zerrhex.f | 4 +- lapack-netlib/TESTING/LIN/zerrsy.f | 4 +- lapack-netlib/TESTING/LIN/zerrsyx.f | 4 +- lapack-netlib/TESTING/LIN/zgtt01.f | 2 +- lapack-netlib/TESTING/LIN/zgtt02.f | 4 +- lapack-netlib/TESTING/LIN/zhet01_3.f | 2 +- lapack-netlib/TESTING/LIN/zlqt02.f | 2 +- lapack-netlib/TESTING/LIN/zptt01.f | 2 +- lapack-netlib/TESTING/LIN/zptt02.f | 2 +- lapack-netlib/TESTING/LIN/zqlt02.f | 2 +- lapack-netlib/TESTING/LIN/zqrt02.f | 2 +- lapack-netlib/TESTING/LIN/zrqt02.f | 2 +- lapack-netlib/TESTING/LIN/zsyt01_3.f | 2 +- 198 files changed, 636 insertions(+), 414 deletions(-) diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake index 585ca26e7..17c0df80e 100644 --- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake +++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake @@ -1,4 +1,4 @@ -# This module perdorms several try-compiles to determine the default integer +# This module performs several try-compiles to determine the default integer # size being used by the fortran compiler # # After execution, the following variables are set. If they are un set then diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake index 384064007..93db45130 100644 --- a/lapack-netlib/CMAKE/Findcodecov.cmake +++ b/lapack-netlib/CMAKE/Findcodecov.cmake @@ -36,7 +36,7 @@ function(add_coverage TNAME) endfunction() -# Find the reuired flags foreach language. +# Find the required flags foreach language. set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) @@ -118,7 +118,7 @@ function (codecov_path_of_source FILE RETURN_VAR) # If expression was found, SOURCEFILE is a generator-expression for an # object library. Currently we found no way to call this function automatic - # for the referenced target, so it must be called in the directoryso of the + # for the referenced target, so it must be called in the directory of the # object library definition. if(NOT "${_source}" STREQUAL "") set(${RETURN_VAR} "" PARENT_SCOPE) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index b704e72c5..fefaa8b89 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -1,16 +1,20 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.2) project(LAPACK Fortran C) set(LAPACK_MAJOR_VERSION 3) -set(LAPACK_MINOR_VERSION 9) +set(LAPACK_MINOR_VERSION 11) set(LAPACK_PATCH_VERSION 0) set( LAPACK_VERSION ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION} ) -# Add the CMake directory for custon CMake modules +# Allow setting a prefix for the library names +set(CMAKE_STATIC_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") +set(CMAKE_SHARED_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}") + +# Add the CMake directory for custom CMake modules set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) # Export all symbols on Windows when building shared libraries @@ -41,6 +45,40 @@ if(_is_coverage_build) find_package(codecov) endif() +# By default test Fortran compiler complex abs and complex division +option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF) +if( TEST_FORTRAN_COMPILER ) + + add_executable( test_zcomplexabs ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + add_custom_target( run_test_zcomplexabs + COMMAND test_zcomplexabs 2> test_zcomplexabs.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexabs in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexabs.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f ) + + add_executable( test_zcomplexdiv ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + add_custom_target( run_test_zcomplexdiv + COMMAND test_zcomplexdiv 2> test_zcomplexdiv.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexdiv in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexdiv.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f ) + + add_executable( test_zcomplexmult ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + add_custom_target( run_test_zcomplexmult + COMMAND test_zcomplexmult 2> test_zcomplexmult.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zcomplexmult in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexmult.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f ) + + add_executable( test_zminMax ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + add_custom_target( run_test_zminMax + COMMAND test_zminMax 2> test_zminMax.err + WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL + COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err" + SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) + +endif() + # By default static library option(BUILD_SHARED_LIBS "Build shared libraries" OFF) @@ -89,12 +127,57 @@ configure_file( include(PreventInSourceBuilds) include(PreventInBuildInstalls) +# Check if recursive flag exists +include(CheckFortranCompilerFlag) +if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang) + check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + check_fortran_compiler_flag("-frecursive" _frecursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) + check_fortran_compiler_flag("-recursive" _recursiveFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL) + check_fortran_compiler_flag("-qrecur" _qrecurFlag) +elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG) + check_fortran_compiler_flag("-recursive" _recursiveFlag) +else() + message(WARNING "Fortran local arrays should be allocated on the stack." + " Please use a compiler which guarantees that feature." + " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.") +endif() + +# Add recursive flag +if(_MrecursiveFlag) + string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_frecursiveFlag) + string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_recursiveFlag) + string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +elseif(_qrecurFlag) + string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}") + if(NOT output_test) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur" + CACHE STRING "Recursive flag must be set" FORCE) + endif() +endif() + if(UNIX) if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") endif() if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict") endif() # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin @@ -128,6 +211,22 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq) endif() endif() +# Add option to enable flat namespace for symbol resolution on macOS +if(APPLE) + option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF) + + if(USE_FLAT_NAMESPACE) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-flat_namespace") + else() + if(BUILD_SHARED_LIBS AND BUILD_TESTING) + message(WARNING + "LAPACK test suite might fail with shared libraries and the default two-level namespace. " + "Disable shared libraries or enable flat namespace for symbol resolution via -DUSE_FLAT_NAMESPACE=ON.") + endif() + endif() +endif() # -------------------------------------------------- set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets) @@ -149,13 +248,14 @@ option(BUILD_TESTING "Build tests" ${_is_coverage_build}) include(CTest) message(STATUS "Build tests: ${BUILD_TESTING}") -# lapack_testing.py uses features from python 2.7 and greater if(BUILD_TESTING) - set(_msg "Looking for Python >= 2.7 needed for summary tests") + set(_msg "Looking for Python3 needed for summary tests") message(STATUS "${_msg}") - find_package(PythonInterp 2.7 QUIET) - if(PYTHONINTERP_FOUND) - message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})") + # find_package(PythonInterp 3) cannot be used because /usr/bin/python may be + # a Python2 interpreter. + find_program(PYTHON_EXECUTABLE python3) + if(PYTHON_EXECUTABLE) + message(STATUS "${_msg} - found") else() message(STATUS "${_msg} - not found (skipping summary tests)") endif() @@ -177,7 +277,7 @@ CheckLAPACKCompilerFlags() # Check second function include(CheckTimeFunction) -set(TIME_FUNC NONE ${TIME_FUNC}) +set(TIME_FUNC NONE) CHECK_TIME_FUNCTION(NONE TIME_FUNC) CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) @@ -210,6 +310,7 @@ if(NOT (BUILD_SINGLE OR BUILD_DOUBLE OR BUILD_COMPLEX OR BUILD_COMPLEX16)) BUILD_SINGLE, BUILD_DOUBLE, BUILD_COMPLEX, BUILD_COMPLEX16.") endif() + # -------------------------------------------------- # Subdirectories that need to be processed option(USE_OPTIMIZED_BLAS "Whether or not to use an optimized BLAS library instead of included netlib BLAS" OFF) @@ -325,35 +426,80 @@ option(LAPACKE_WITH_TMG "Build LAPACKE with tmglib routines" OFF) if(LAPACKE_WITH_TMG) set(LAPACKE ON) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) #already included, avoid double inclusion + +# TMGLIB +# Cache export target +set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME}) +if(BUILD_TESTING OR LAPACKE_WITH_TMG) + if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG) + set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) + # Check if dlatms (part of tmg) is found + CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND) + unset(CMAKE_REQUIRED_LIBRARIES) + if(NOT LAPACK_WITH_TMGLIB_FOUND) + # Build and install TMG as part of LAPACKE targets (as opposed to LAPACK + # targets) + set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets) + endif() + endif() add_subdirectory(TESTING/MATGEN) endif() +# Reset export target +set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE}) +unset(LAPACK_INSTALL_EXPORT_NAME_CACHE) if(LAPACKE) add_subdirectory(LAPACKE) endif() + #------------------------------------- # BLAS++ / LAPACK++ option(BLAS++ "Build BLAS++" OFF) option(LAPACK++ "Build LAPACK++" OFF) - - + + function(_display_cpp_implementation_msg name) string(TOLOWER ${name} name_lc) message(STATUS "${name}++ enable") message(STATUS "----------------") message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library") message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.") - message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp") message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu") message(STATUS "----------------") endfunction() -if(BLAS++) +if (BLAS++) _display_cpp_implementation_msg("BLAS") + include(ExternalProject) + ExternalProject_Add(blaspp + URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES}) endif() -if(LAPACK++) +if (LAPACK++) + message (STATUS "linking lapack++ against ${LAPACK_LIBRARIES}") _display_cpp_implementation_msg("LAPACK") + include(ExternalProject) + if (BUILD_SHARED_LIBS) + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + else () +# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files + ExternalProject_Add(lapackpp + URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz + CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp + BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make + INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + ) + endif() + ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) endif() # -------------------------------------------------- @@ -370,7 +516,7 @@ set(CPACK_MONOLITHIC_INSTALL ON) set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK") if(WIN32 AND NOT UNIX) # There is a bug in NSI that does not handle full unix paths properly. Make - # sure there is at least one set of four (4) backlasshes. + # sure there is at least one set of four (4) backslashes. set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum") set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/lapack") set(CPACK_NSIS_CONTACT "lapack@eecs.utk.edu") @@ -396,10 +542,6 @@ if(NOT LATESTLAPACK_FOUND) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB}) endif() -if(BUILD_TESTING OR LAPACKE_WITH_TMG) - set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) -endif() - # Export lapack targets, not including lapacke, from the # install tree, if any. set(_lapack_config_install_guard_target "") @@ -424,6 +566,10 @@ if(LAPACKE) set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB}) endif() +if(NOT LAPACK_WITH_TMGLIB_FOUND AND LAPACKE_WITH_TMG) + set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB}) +endif() + # Export lapack and lapacke targets from the build tree, if any. set(_lapack_config_build_guard_target "") if(ALL_TARGETS) @@ -461,4 +607,114 @@ install(FILES DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION} COMPONENT Development ) - +if (LAPACK++) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "liblapackpp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) + write_basic_package_version_file( + "lapackppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + +endif() +if (BLAS++) + write_basic_package_version_file( + "blasppConfigVersion.cmake" + VERSION 2020.10.02 + COMPATIBILITY AnyNewerVersion + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/" + ) + install( + DIRECTORY "${LAPACK_BINARY_DIR}/lib/" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + FILES_MATCHING REGEX "libblaspp.(a|so)$" + ) + install( + DIRECTORY "${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp/include/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + FILES_MATCHING REGEX "\\.(h|hh)$" + ) +endif() + +# -------------------------------------------------- +# Generate MAN and/or HTML Documentation +option(BUILD_HTML_DOCUMENTATION "Create and install the HTML based API +documentation (requires Doxygen) - command: make html" OFF) +option(BUILD_MAN_DOCUMENTATION "Create and install the MAN based documentation (requires Doxygen) - command: make man" OFF) +message(STATUS "Build html documentation: ${BUILD_HTML_DOCUMENTATION}") +message(STATUS "Build man documentation: ${BUILD_MAN_DOCUMENTATION}") + +if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) + find_package(Doxygen) + if(NOT DOXYGEN_FOUND) + message(WARNING "Doxygen is needed to build the documentation.") + + else() + + set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage") + set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION}) + set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS) + set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png) + set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES) + set(DOXYGEN_SOURCE_BROWSER YES) + set(DISTRIBUTE_GROUP_DOC YES) + set(DOXYGEN_CREATE_SUBDIRS YES) + set(DOXYGEN_SEPARATE_MEMBER_PAGES YES) + set(DOXYGEN_EXTRACT_ALL YES) + set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h") + set(DOXYGEN_RECURSIVE YES) + set(DOXYGEN_GENERATE_TREEVIEW YES) + set(DOXYGEN_INTERACTIVE_SVG YES) + set(DOXYGEN_QUIET YES) + set(DOXYGEN_WARNINGS NO) + set(DOXYGEN_GENERATE_HTML NO) + set(DOXYGEN_GENERATE_MAN NO) + + + if (BUILD_HTML_DOCUMENTATION) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_HTML_OUTPUT explore-html) + set(DOXYGEN_INLINE_SOURCES YES) + set(DOXYGEN_CALL_GRAPH YES) + set(DOXYGEN_CALLER_GRAPH YES) + + doxygen_add_docs( + html + ${PROJECT_SOURCE_DIR} + COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)" + ) + endif() + if (BUILD_MAN_DOCUMENTATION) + set(DOXYGEN_GENERATE_MAN YES) + set(DOXYGEN_EXCLUDE SRC/VARIANTS) + set(DOXYGEN_MAN_LINKS YES) + set(DOXYGEN_INLINE_SOURCES NO) + set(DOXYGEN_CALL_GRAPH NO) + set(DOXYGEN_CALLER_GRAPH NO) + + doxygen_add_docs( + man + ${PROJECT_SOURCE_DIR} + COMMENT "Generating man LAPACK documentation" + ) + endif() + + endif() +endif() diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 794c2a7aa..668ca8d68 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -575,7 +575,7 @@ There are six machine-dependent functions in the test and timing package, at least three of which must be installed. They are \begin{tabbing} -MONOMO \= DOUBLE PRECYSION \= \kill +MONOMO \= DOUBLE PRECISION \= \kill LSAME \> LOGICAL \> Test if two characters are the same regardless of case \\ SLAMCH \> REAL \> Determine machine-dependent parameters \\ DLAMCH \> DOUBLE PRECISION \> Determine machine-dependent parameters \\ diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c index 2adf71493..b32fc2f9e 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c index 6624936a6..89e421eae 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c index 046fdb9ca..a90c9617a 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation. + * This is just reference implementation. */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c index 32a9de379..63323b1c3 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c @@ -54,7 +54,7 @@ void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, } } else if ( matrix_layout == LAPACK_ROW_MAJOR ) { /* TODO: interchange loops for performance. - * This is just reference impemeltation + * This is just reference implementation */ for( j = 0; j < MIN( n, ldin ); j++ ) { for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 ); diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.f b/lapack-netlib/SRC/DEPRECATED/cggsvd.f index e6fdb47e5..515ac8d49 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvd.f +++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.f @@ -107,7 +107,7 @@ *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x. diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.f b/lapack-netlib/SRC/DEPRECATED/zggsvd.f index c0b9247a6..8a41e36c6 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvd.f +++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.f @@ -106,7 +106,7 @@ *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of *> A and B implicitly gives the SVD of A*inv(B): *> A*inv(B) = U*(D1*inv(D2))*V**H. -*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also +*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also *> equal to the CS decomposition of A and B. Furthermore, the GSVD can *> be used to derive the solution of the eigenvalue problem: *> A**H*A x = lambda* B**H*B x. diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 2b7b2dc70..1fc75613e 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f index 1a587eca3..95091c2c0 100644 --- a/lapack-netlib/SRC/cgesvdq.f +++ b/lapack-netlib/SRC/cgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f index 9e6053013..7426ecdf1 100644 --- a/lapack-netlib/SRC/cgsvj0.f +++ b/lapack-netlib/SRC/cgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f index 6d68e920d..18dd690cc 100644 --- a/lapack-netlib/SRC/cgsvj1.f +++ b/lapack-netlib/SRC/cgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index e474c6fad..36970a329 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f index 4bb032382..e687ec64a 100644 --- a/lapack-netlib/SRC/chetf2_rk.f +++ b/lapack-netlib/SRC/chetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetf2_rook.f b/lapack-netlib/SRC/chetf2_rook.f index ee4eaf68f..49fba1bda 100644 --- a/lapack-netlib/SRC/chetf2_rook.f +++ b/lapack-netlib/SRC/chetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = CONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = CONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = CONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = CONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = REAL( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 54567b8cd..d9e4fbd19 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 20a9859e8..400efdf26 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f index 6d43c8325..e8b96fd6e 100644 --- a/lapack-netlib/SRC/cla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, +* SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, * NRHS, AB, LDAB, AFB, LDAFB, IPIV, * COLEQU, C, B, LDB, Y, LDY, * BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -400,7 +400,7 @@ *> \ingroup complexGBcomputational * * ===================================================================== - SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU, + SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU, $ NRHS, AB, LDAB, AFB, LDAFB, IPIV, $ COLEQU, C, B, LDB, Y, LDY, $ BERR_OUT, N_NORMS, ERR_BNDS_NORM, @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f index 4b1031101..e524db5ad 100644 --- a/lapack-netlib/SRC/cla_gerfsx_extended.f +++ b/lapack-netlib/SRC/cla_gerfsx_extended.f @@ -637,7 +637,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f index 6d007ef58..19f845692 100644 --- a/lapack-netlib/SRC/cla_herfsx_extended.f +++ b/lapack-netlib/SRC/cla_herfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f index 9ced9b1b9..2dafecb35 100644 --- a/lapack-netlib/SRC/cla_porfsx_extended.f +++ b/lapack-netlib/SRC/cla_porfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f index 4fe538a98..95f969731 100644 --- a/lapack-netlib/SRC/cla_syrfsx_extended.f +++ b/lapack-netlib/SRC/cla_syrfsx_extended.f @@ -654,7 +654,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/claed7.f b/lapack-netlib/SRC/claed7.f index 9d2f97141..72f1417d4 100644 --- a/lapack-netlib/SRC/claed7.f +++ b/lapack-netlib/SRC/claed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. +* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 6de40e06c..c6cc5847d 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -310,7 +310,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, + EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, $ CLARTG, CROT REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME @@ -462,7 +462,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -533,7 +532,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f index ee6959ed8..5f042e345 100644 --- a/lapack-netlib/SRC/clarfb_gett.f +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f index 5445e387e..1e7d71669 100644 --- a/lapack-netlib/SRC/clatdf.f +++ b/lapack-netlib/SRC/clatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + REAL( CDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index a902f1ed0..0502f6898 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -577,7 +577,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f index f3b46f0b4..febbc411c 100644 --- a/lapack-netlib/SRC/csyconvf.f +++ b/lapack-netlib/SRC/csyconvf.f @@ -39,7 +39,7 @@ *> CSYTRF provided on entry in parameter A into the factorization *> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF into +*> the interchanges stored in IPIV from the format used in CSYTRF into *> the format used in CSYTRF_RK (or CSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or CSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in CSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in CSYTRF_RK +*> the interchanges stored in IPIV from the format used in CSYTRF_RK *> (or CSYTRF_BK) into the format used in CSYTRF. *> *> CSYCONVF can also convert in Hermitian matrix case, i.e. between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f index a5b9d82da..0da34e0fa 100644 --- a/lapack-netlib/SRC/csyconvf_rook.f +++ b/lapack-netlib/SRC/csyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index d8881a748..22227505c 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f index b1165a425..c5467bf01 100644 --- a/lapack-netlib/SRC/csytrf_aa.f +++ b/lapack-netlib/SRC/csytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index 8a2cfd7bc..b21df8cd3 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ctrexc.f b/lapack-netlib/SRC/ctrexc.f index ea64ddf9b..2bc0348fb 100644 --- a/lapack-netlib/SRC/ctrexc.f +++ b/lapack-netlib/SRC/ctrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f index 740e38a85..80faa8808 100644 --- a/lapack-netlib/SRC/cunbdb1.f +++ b/lapack-netlib/SRC/cunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index b45db6100..94b9fdbf9 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f index b532bfbc8..f942bc698 100644 --- a/lapack-netlib/SRC/cunbdb3.f +++ b/lapack-netlib/SRC/cunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index 117f23d08..a551c184e 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f index 4ccd4edad..4668a88f2 100644 --- a/lapack-netlib/SRC/dbdsvdx.f +++ b/lapack-netlib/SRC/dbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], DBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index 798e9154d..1db85e9c2 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f index 6f9ac703e..a514def8b 100644 --- a/lapack-netlib/SRC/dgesvdq.f +++ b/lapack-netlib/SRC/dgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f index 586723e64..1008aa8c6 100644 --- a/lapack-netlib/SRC/dgsvj0.f +++ b/lapack-netlib/SRC/dgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f index d757bb927..f1a099b2a 100644 --- a/lapack-netlib/SRC/dgsvj1.f +++ b/lapack-netlib/SRC/dgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f index 5454b1bfc..b331178a9 100644 --- a/lapack-netlib/SRC/dla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f @@ -645,7 +645,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f index 92b0d76d4..bf8260d04 100644 --- a/lapack-netlib/SRC/dla_gerfsx_extended.f +++ b/lapack-netlib/SRC/dla_gerfsx_extended.f @@ -625,7 +625,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f index 5c8850fef..94d1087cd 100644 --- a/lapack-netlib/SRC/dla_porfsx_extended.f +++ b/lapack-netlib/SRC/dla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f index e1cde6fc2..2e06a622b 100644 --- a/lapack-netlib/SRC/dla_syrfsx_extended.f +++ b/lapack-netlib/SRC/dla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index c4cb95fd3..84cb96bcb 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -102,7 +102,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -332,7 +332,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, + EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, $ DLARTG, DROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME @@ -482,7 +482,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -567,7 +566,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f index 10ab6461e..2c7ea59b0 100644 --- a/lapack-netlib/SRC/dlarfb_gett.f +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index b4a98bc78..e6d78b672 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -574,7 +574,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/dorbdb1.f b/lapack-netlib/SRC/dorbdb1.f index dae18f5df..b5b2d1362 100644 --- a/lapack-netlib/SRC/dorbdb1.f +++ b/lapack-netlib/SRC/dorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index a0dacbb16..0b4ad732c 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb3.f b/lapack-netlib/SRC/dorbdb3.f index 7b3727956..79b10a5d3 100644 --- a/lapack-netlib/SRC/dorbdb3.f +++ b/lapack-netlib/SRC/dorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> DORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f index 08604be45..985be3277 100644 --- a/lapack-netlib/SRC/dorbdb4.f +++ b/lapack-netlib/SRC/dorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> DORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f index 9e7a5af0e..005c98a03 100644 --- a/lapack-netlib/SRC/dsyconvf.f +++ b/lapack-netlib/SRC/dsyconvf.f @@ -39,7 +39,7 @@ *> DSYTRF provided on entry in parameter A into the factorization *> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF into +*> the interchanges stored in IPIV from the format used in DSYTRF into *> the format used in DSYTRF_RK (or DSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or DSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in DSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in DSYTRF_RK +*> the interchanges stored in IPIV from the format used in DSYTRF_RK *> (or DSYTRF_BK) into the format used in DSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f index d7f529358..c3f2083bd 100644 --- a/lapack-netlib/SRC/dsyconvf_rook.f +++ b/lapack-netlib/SRC/dsyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 1100702ba..72fbe1e9a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -89,7 +89,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 346737953..9a0b26ce5 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index b7f5f07c2..c65bd86e6 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f index dcd554f1d..d7cb2dc83 100644 --- a/lapack-netlib/SRC/sbdsvdx.f +++ b/lapack-netlib/SRC/sbdsvdx.f @@ -45,7 +45,7 @@ *> *> Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ] *> and superdiagonal E = [ e_1 e_2 ... e_N-1 ], SBDSVDX computes the -*> singular value decompositon of B through the eigenvalues and +*> singular value decomposition of B through the eigenvalues and *> eigenvectors of the N*2-by-N*2 tridiagonal matrix *> *> | 0 d_1 | diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index 1333e54fb..42d2b8313 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -253,7 +253,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^t. In that case, [U] is computed *> in V as right singular vectors of A^t and then *> copied back to the U array. This 'W' option is just diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f index ebbfc70b1..6fb328b8c 100644 --- a/lapack-netlib/SRC/sgesvdq.f +++ b/lapack-netlib/SRC/sgesvdq.f @@ -365,7 +365,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f index 9249f4219..7e5fddeb0 100644 --- a/lapack-netlib/SRC/sgsvj0.f +++ b/lapack-netlib/SRC/sgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f index 79fd4d2d3..f1d1f7239 100644 --- a/lapack-netlib/SRC/sgsvj1.f +++ b/lapack-netlib/SRC/sgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f index 499d6bf90..c79330cb5 100644 --- a/lapack-netlib/SRC/sla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f @@ -644,7 +644,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f index de05d8eb4..1a19a5071 100644 --- a/lapack-netlib/SRC/sla_gerfsx_extended.f +++ b/lapack-netlib/SRC/sla_gerfsx_extended.f @@ -628,7 +628,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL SAXPY( N, 1.0, DY, 1, Y( 1, J ), 1 ) diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f index ada4cad21..9f33b14dd 100644 --- a/lapack-netlib/SRC/sla_porfsx_extended.f +++ b/lapack-netlib/SRC/sla_porfsx_extended.f @@ -617,7 +617,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f index d5096be02..2fa3b1c50 100644 --- a/lapack-netlib/SRC/sla_syrfsx_extended.f +++ b/lapack-netlib/SRC/sla_syrfsx_extended.f @@ -647,7 +647,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 2e06f9d42..8b2d3286e 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -100,7 +100,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -329,7 +329,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, + EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, $ SLARTG, SROT REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME @@ -479,7 +479,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) @@ -564,7 +563,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f index 7719f2965..f1fdef790 100644 --- a/lapack-netlib/SRC/slarfb_gett.f +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -451,7 +451,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index c3a08e524..8f0c4bf16 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -574,7 +574,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2-K1 diff --git a/lapack-netlib/SRC/sorbdb1.f b/lapack-netlib/SRC/sorbdb1.f index c860f4366..191e5742a 100644 --- a/lapack-netlib/SRC/sorbdb1.f +++ b/lapack-netlib/SRC/sorbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f index 484d352f8..b2ff34bb1 100644 --- a/lapack-netlib/SRC/sorbdb2.f +++ b/lapack-netlib/SRC/sorbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb3.f b/lapack-netlib/SRC/sorbdb3.f index 6209b24ee..99478c5d0 100644 --- a/lapack-netlib/SRC/sorbdb3.f +++ b/lapack-netlib/SRC/sorbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> SORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f index bf60fb7bb..0fef5b759 100644 --- a/lapack-netlib/SRC/sorbdb4.f +++ b/lapack-netlib/SRC/sorbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> SORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f index 6defc1f0e..af55da51a 100644 --- a/lapack-netlib/SRC/ssyconvf.f +++ b/lapack-netlib/SRC/ssyconvf.f @@ -39,7 +39,7 @@ *> SSYTRF provided on entry in parameter A into the factorization *> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF into +*> the interchanges stored in IPIV from the format used in SSYTRF into *> the format used in SSYTRF_RK (or SSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or SSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in SSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in SSYTRF_RK +*> the interchanges stored in IPIV from the format used in SSYTRF_RK *> (or SSYTRF_BK) into the format used in SSYTRF. *> \endverbatim * @@ -322,7 +322,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -466,7 +466,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -532,7 +532,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f index c59f257bb..efd7c57fe 100644 --- a/lapack-netlib/SRC/ssyconvf_rook.f +++ b/lapack-netlib/SRC/ssyconvf_rook.f @@ -517,7 +517,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index b9eac8342..aa862f14b 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index 13e849cdc..4ba026fc8 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index 94b3aa4b5..07357f2ab 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index a36a9188a..b4bc531ab 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -252,7 +252,7 @@ *> If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of *> the right singular vectors; *> If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N), -*> then V is used as workspace if the pprocedure +*> then V is used as workspace if the procedure *> replaces A with A^*. In that case, [U] is computed *> in V as right singular vectors of A^* and then *> copied back to the U array. This 'W' option is just diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f index 05eb722f8..b990f7389 100644 --- a/lapack-netlib/SRC/zgesvdq.f +++ b/lapack-netlib/SRC/zgesvdq.f @@ -363,7 +363,7 @@ *> an optimal implementation would do all necessary scaling before calling *> CGESVD and the scaling in CGESVD can be switched off. *> 3. Other comments related to code optimization are given in comments in the -*> code, enlosed in [[double brackets]]. +*> code, enclosed in [[double brackets]]. *> \endverbatim * *> \par Bugs, examples and comments diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f index 11bfbd8dc..075c243c2 100644 --- a/lapack-netlib/SRC/zgsvj0.f +++ b/lapack-netlib/SRC/zgsvj0.f @@ -52,10 +52,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f index efe0384ef..ba71e155f 100644 --- a/lapack-netlib/SRC/zgsvj1.f +++ b/lapack-netlib/SRC/zgsvj1.f @@ -75,10 +75,10 @@ *> Specifies whether the output from this procedure is used *> to compute the matrix V: *> = 'V': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the N-by-N array V. +*> by postmultiplying the N-by-N array V. *> (See the description of V.) *> = 'A': the product of the Jacobi rotations is accumulated -*> by postmulyiplying the MV-by-N array V. +*> by postmultiplying the MV-by-N array V. *> (See the descriptions of MV and V.) *> = 'N': the Jacobi rotations are not accumulated. *> \endverbatim diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index ee1596855..79c01c546 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -88,7 +88,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f index 050c7993d..87df901aa 100644 --- a/lapack-netlib/SRC/zhetf2_rk.f +++ b/lapack-netlib/SRC/zhetf2_rk.f @@ -480,7 +480,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -508,7 +508,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -834,7 +834,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -862,7 +862,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetf2_rook.f b/lapack-netlib/SRC/zhetf2_rook.f index 94bb29736..91172f601 100644 --- a/lapack-netlib/SRC/zhetf2_rook.f +++ b/lapack-netlib/SRC/zhetf2_rook.f @@ -420,7 +420,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 14 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -441,7 +441,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 15 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) @@ -733,7 +733,7 @@ A( J, K ) = DCONJG( A( P, J ) ) A( P, J ) = T 44 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( P, K ) = DCONJG( A( P, K ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( K, K ) ) @@ -754,7 +754,7 @@ A( J, KK ) = DCONJG( A( KP, J ) ) A( KP, J ) = T 45 CONTINUE -* (3) Swap and conjugate corner elements at row-col interserction +* (3) Swap and conjugate corner elements at row-col intersection A( KP, KK ) = DCONJG( A( KP, KK ) ) * (4) Swap diagonal elements at row-col intersection R1 = DBLE( A( KK, KK ) ) diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 56722e7e6..537c16e8c 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 92f1c09b3..477602b5e 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f index fe4d635b1..2f57b7682 100644 --- a/lapack-netlib/SRC/zla_gbrfsx_extended.f +++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f @@ -651,7 +651,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f index 9d618f294..22e45c5a9 100644 --- a/lapack-netlib/SRC/zla_gerfsx_extended.f +++ b/lapack-netlib/SRC/zla_gerfsx_extended.f @@ -636,7 +636,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f index a55dd9431..689460a02 100644 --- a/lapack-netlib/SRC/zla_herfsx_extended.f +++ b/lapack-netlib/SRC/zla_herfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f index 12e05e049..e853494fc 100644 --- a/lapack-netlib/SRC/zla_porfsx_extended.f +++ b/lapack-netlib/SRC/zla_porfsx_extended.f @@ -626,7 +626,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF (Y_PREC_STATE .LT. EXTRA_Y) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f index d6c241499..fb1b9e2d7 100644 --- a/lapack-netlib/SRC/zla_syrfsx_extended.f +++ b/lapack-netlib/SRC/zla_syrfsx_extended.f @@ -655,7 +655,7 @@ PREVNORMDX = NORMDX PREV_DZ_Z = DZ_Z * -* Update soluton. +* Update solution. * IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 ) diff --git a/lapack-netlib/SRC/zlaed7.f b/lapack-netlib/SRC/zlaed7.f index 83f32d8b8..86e5ec6b5 100644 --- a/lapack-netlib/SRC/zlaed7.f +++ b/lapack-netlib/SRC/zlaed7.f @@ -363,7 +363,7 @@ RETURN END IF * -* Prepare the INDXQ sorting premutation. +* Prepare the INDXQ sorting permutation. * N1 = K N2 = N - K diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 3e20200ed..dcb28850a 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -89,7 +89,7 @@ *> Anal., 29(2006), pp. 199--227. *> *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift, -*> multipole rational QZ method with agressive early deflation" +*> multipole rational QZ method with aggressive early deflation" *> \endverbatim * * Arguments: @@ -312,7 +312,7 @@ CHARACTER :: JBCMPZ*3 * External Functions - EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, + EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, $ ZLARTG, ZROT DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME @@ -464,7 +464,6 @@ * Get machine constants SAFMIN = DLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) @@ -535,7 +534,7 @@ DO WHILE ( K.GE.ISTART2 ) IF( ABS( B( K, K ) ) .LT. BTOL ) THEN -* A diagonal element of B is negligable, move it +* A diagonal element of B is negligible, move it * to the top and deflate it DO K2 = K, ISTART2+1, -1 diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f index 4a3c4dcf1..17d4b33aa 100644 --- a/lapack-netlib/SRC/zlarfb_gett.f +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -452,7 +452,7 @@ IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, -* V1 is not an identy matrix, but unit lower-triangular +* V1 is not an identity matrix, but unit lower-triangular * V1 stored in A1 (diagonal ones are not stored). * * diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f index 25e71edce..41714a3e0 100644 --- a/lapack-netlib/SRC/zlatdf.f +++ b/lapack-netlib/SRC/zlatdf.f @@ -227,7 +227,7 @@ BM = RHS( J ) - CONE SPLUS = ONE * -* Lockahead for L- part RHS(1:N-1) = +-1 +* Look-ahead for L- part RHS(1:N-1) = +-1 * SPLUS and SMIN computed more efficiently than in BSOLVE[1]. * SPLUS = SPLUS + DBLE( ZDOTC( N-J, Z( J+1, J ), 1, Z( J+1, diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index fc1be0517..231a17274 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -577,7 +577,7 @@ * Prepare the linear update to be executed with GEMM. * For each column, compute a consistent scaling, a * scaling factor to survive the linear update, and -* rescale the column segments, if necesssary. Then +* rescale the column segments, if necessary. Then * the linear update is safely executed. * DO KK = 1, K2 - K1 diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f index eb49b0f3d..0958a5f77 100644 --- a/lapack-netlib/SRC/zsyconvf.f +++ b/lapack-netlib/SRC/zsyconvf.f @@ -39,7 +39,7 @@ *> ZSYTRF provided on entry in parameter A into the factorization *> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored *> on exit in parameters A and E. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF into +*> the interchanges stored in IPIV from the format used in ZSYTRF into *> the format used in ZSYTRF_RK (or ZSYTRF_BK). *> *> If parameter WAY = 'R': @@ -48,7 +48,7 @@ *> (or ZSYTRF_BK) provided on entry in parameters A and E into *> the factorization output format used in ZSYTRF that is stored *> on exit in parameter A. It also converts in place details of -*> the intechanges stored in IPIV from the format used in ZSYTRF_RK +*> the interchanges stored in IPIV from the format used in ZSYTRF_RK *> (or ZSYTRF_BK) into the format used in ZSYTRF. *> *> ZSYCONVF can also convert in Hermitian matrix case, i.e. between @@ -325,7 +325,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -469,7 +469,7 @@ END IF * * Convert IPIV -* There is no interchnge of rows i and and IPIV(i), +* There is no interchange of rows i and and IPIV(i), * so this should be reflected in IPIV format for * *SYTRF_RK ( or *SYTRF_BK) * @@ -535,7 +535,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f index 3cfa694c3..62cca060b 100644 --- a/lapack-netlib/SRC/zsyconvf_rook.f +++ b/lapack-netlib/SRC/zsyconvf_rook.f @@ -520,7 +520,7 @@ * * Revert VALUE * Assign subdiagonal entries of D from array E to -* subgiagonal entries of A. +* subdiagonal entries of A. * I = 1 DO WHILE ( I.LE.N-1 ) diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f index 4f19630a9..701d73a38 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.f +++ b/lapack-netlib/SRC/zsysv_aa_2stage.f @@ -87,7 +87,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f index 73d257cfe..ddb19925a 100644 --- a/lapack-netlib/SRC/zsytrf_aa.f +++ b/lapack-netlib/SRC/zsytrf_aa.f @@ -74,7 +74,7 @@ *> *> On exit, the tridiagonal matrix is stored in the diagonals *> and the subdiagonals of A just below (or above) the diagonals, -*> and L is stored below (or above) the subdiaonals, when UPLO +*> and L is stored below (or above) the subdiagonals, when UPLO *> is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f index b731cb7a2..95b9fda0d 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f @@ -75,7 +75,7 @@ *> triangular part of the matrix A, and the strictly upper *> triangular part of A is not referenced. *> -*> On exit, L is stored below (or above) the subdiaonal blocks, +*> On exit, L is stored below (or above) the subdiagonal blocks, *> when UPLO is 'L' (or 'U'). *> \endverbatim *> diff --git a/lapack-netlib/SRC/ztrexc.f b/lapack-netlib/SRC/ztrexc.f index b92e63efa..32aae51e4 100644 --- a/lapack-netlib/SRC/ztrexc.f +++ b/lapack-netlib/SRC/ztrexc.f @@ -40,7 +40,7 @@ *> *> The Schur form T is reordered by a unitary similarity transformation *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by -*> postmultplying it with Z. +*> postmultiplying it with Z. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zunbdb1.f b/lapack-netlib/SRC/zunbdb1.f index 767788a74..2fae170de 100644 --- a/lapack-netlib/SRC/zunbdb1.f +++ b/lapack-netlib/SRC/zunbdb1.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f index 46b08aa1e..28e78fc23 100644 --- a/lapack-netlib/SRC/zunbdb2.f +++ b/lapack-netlib/SRC/zunbdb2.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb3.f b/lapack-netlib/SRC/zunbdb3.f index 74083e41a..9f32a7a88 100644 --- a/lapack-netlib/SRC/zunbdb3.f +++ b/lapack-netlib/SRC/zunbdb3.f @@ -37,7 +37,7 @@ *>\verbatim *> *> ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f index 4672cfa67..a1db5eb79 100644 --- a/lapack-netlib/SRC/zunbdb4.f +++ b/lapack-netlib/SRC/zunbdb4.f @@ -38,7 +38,7 @@ *>\verbatim *> *> ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny -*> matrix X with orthonomal columns: +*> matrix X with orthonormal columns: *> *> [ B11 ] *> [ X11 ] [ P1 | ] [ 0 ] diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f index 95747d051..ed535e91f 100644 --- a/lapack-netlib/TESTING/EIG/cchkst.f +++ b/lapack-netlib/TESTING/EIG/cchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -665,8 +665,7 @@ EXTERNAL CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, CUPGTR, - $ SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA + $ SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CONJG, INT, LOG, MAX, MIN, REAL, SQRT @@ -733,7 +732,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index e4deb8ac8..668b51a36 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,10 +683,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF, - $ XERBLA, CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, - $ CLACPY, CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, - $ CSTEMR, CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, + EXTERNAL SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA, + $ CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY, + $ CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR, + $ CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, $ CUPGTR, CHETRD_2STAGE, SLASET * .. * .. Intrinsic Functions .. @@ -754,7 +754,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f index c5baeef5e..1729770a2 100644 --- a/lapack-netlib/TESTING/EIG/cdrgsx.f +++ b/lapack-netlib/TESTING/EIG/cdrgsx.f @@ -395,7 +395,7 @@ * .. * .. External Subroutines .. EXTERNAL ALASVM, CGESVD, CGET51, CGGESX, CLACPY, CLAKF2, - $ CLASET, CLATM5, SLABAD, XERBLA + $ CLASET, CLATM5, XERBLA * .. * .. Scalars in Common .. LOGICAL FS @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -917,7 +916,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index d15b39d01..729976738 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -420,7 +420,7 @@ * .. External Subroutines .. EXTERNAL CHBGV, CHBGVD, CHBGVX, CHEGV, CHEGVD, CHEGVX, $ CHPGV, CHPGVD, CHPGVX, CLACPY, CLASET, CLATMR, - $ CLATMS, CSGT01, SLABAD, SLAFTS, SLASUM, XERBLA + $ CLATMS, CSGT01, SLAFTS, SLASUM, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -481,7 +481,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f index 8b8553773..f7d323247 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, + EXTERNAL SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD, $ CHBGVX, CHEGV, CHEGVD, CHEGVX, CHPGV, CHPGVD, $ CHPGVX, CLACPY, CLASET, CLATMR, CLATMS, CSGT01, $ CHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst.f b/lapack-netlib/TESTING/EIG/cdrvst.f index 9c129c0e8..205f06f0d 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst.f +++ b/lapack-netlib/TESTING/EIG/cdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -393,8 +393,8 @@ * .. External Subroutines .. EXTERNAL ALASVM, CHBEV, CHBEVD, CHBEVX, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHET21, CHET22, CHPEV, CHPEVD, - $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLABAD, - $ SLAFTS, XERBLA + $ CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLAFTS, + $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -451,7 +451,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cdrvst2stg.f b/lapack-netlib/TESTING/EIG/cdrvst2stg.f index 954c7fb87..258f1f370 100644 --- a/lapack-netlib/TESTING/EIG/cdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/cdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by CSTEQR simlutaneously +*> The eigenvalues of A, as computed by CSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLAFTS, XERBLA, CHBEV, CHBEVD, + EXTERNAL ALASVM, SLAFTS, XERBLA, CHBEV, CHBEVD, $ CHBEVX, CHEEV, CHEEVD, CHEEVR, CHEEVX, CHET21, $ CHET22, CHPEV, CHPEVD, CHPEVX, CLACPY, CLASET, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index eef34b44b..18b5cad36 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -628,56 +628,56 @@ SRNAMT = 'CHEEVX_2STAGE' INFOT = 1 CALL CHEEVX_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVX_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVX_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVX_2STAGE( 'N', 'A', '/', 0, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) INFOT = 4 CALL CHEEVX_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVX_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 1, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVX_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, X, Z, 2, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 3, RW, IW, I3, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 17 CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I1, INFO ) CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 11 @@ -755,79 +755,79 @@ N = 1 INFOT = 1 CALL CHEEVR_2STAGE( '/', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 1 CALL CHEEVR_2STAGE( 'V', 'A', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHEEVR_2STAGE( 'N', '/', 'U', 0, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHEEVR_2STAGE( 'N', 'A', '/', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 4 CALL CHEEVR_2STAGE( 'N', 'A', 'U', -1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, $ IW( 2*N+1 ), 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 6 CALL CHEEVR_2STAGE( 'N', 'A', 'U', 2, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHEEVR_2STAGE( 'N', 'V', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 9 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 0, 1, 0.0D0, + $ 0.0, 0.0, 0, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 2, A, 2, - $ 0.0D0, 0.0D0, 2, 1, 0.0D0, + $ 0.0, 0.0, 2, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 15 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 0, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, - $ 0.0D0, 0.0D0, 1, 1, 0.0D0, + $ 0.0, 0.0, 1, 1, 0.0, $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) @@ -1259,65 +1259,65 @@ SRNAMT = 'CHBEVX_2STAGE' INFOT = 1 CALL CHBEVX_2STAGE( '/', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 1 CALL CHBEVX_2STAGE( 'V', 'A', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 2 CALL CHBEVX_2STAGE( 'N', '/', 'U', 0, 0, A, 1, Q, 1, - $ 0.0D0, 1.0D0, 1, 0, 0.0D0, + $ 0.0, 1.0, 1, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 3 CALL CHBEVX_2STAGE( 'N', 'A', '/', 0, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) INFOT = 4 CALL CHBEVX_2STAGE( 'N', 'A', 'U', -1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 0, -1, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 7 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 1, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) * INFOT = 9 * CALL CHBEVX_2STAGE( 'V', 'A', 'U', 2, 0, A, 1, Q, 1, -* $ 0.0D0, 0.0D0, 0, 0, 0.0D0, +* $ 0.0, 0.0, 0, 0, 0.0, * $ M, X, Z, 2, W, 0, RW, IW, I3, INFO ) * CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHBEVX_2STAGE( 'N', 'V', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 13 CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1, - $ 0.0D0, 0.0D0, 1, 2, 0.0D0, + $ 0.0, 0.0, 1, 2, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 0, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2, - $ 0.0D0, 0.0D0, 0, 0, 0.0D0, + $ 0.0, 0.0, 0, 0, 0.0, $ M, X, Z, 1, W, 0, RW, IW, I3, INFO ) CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 12 diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f index 2e04f68c5..6e02c84fb 100644 --- a/lapack-netlib/TESTING/EIG/dchkst.f +++ b/lapack-netlib/TESTING/EIG/dchkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -715,7 +715,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index 2c98b802d..0fbebc8b7 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR, - $ DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, - $ DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, - $ DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, + EXTERNAL DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS, + $ DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ, + $ DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF, + $ DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA, $ DSYTRD_2STAGE * .. * .. Intrinsic Functions .. @@ -737,7 +737,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f index b3f5e23f4..82470ead3 100644 --- a/lapack-netlib/TESTING/EIG/ddrgsx.f +++ b/lapack-netlib/TESTING/EIG/ddrgsx.f @@ -400,7 +400,7 @@ EXTERNAL DLCTSX, ILAENV, DLAMCH, DLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, DLABAD, + EXTERNAL ALASVM, DGESVD, DGET51, DGET53, DGGESX, $ DLACPY, DLAKF2, DLASET, DLATM5, XERBLA * .. * .. Intrinsic Functions .. @@ -478,7 +478,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -984,7 +983,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 2e9d3c643..72c373086 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -234,7 +234,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f index 196c6b48e..0fe31cab1 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B DOUBLE PRECISION array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, + EXTERNAL DLACPY, DLAFTS, DLASET, DLASUM, DLATMR, $ DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV, $ DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA, $ DSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst.f b/lapack-netlib/TESTING/EIG/ddrvst.f index a25077018..805fd8271 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst.f +++ b/lapack-netlib/TESTING/EIG/ddrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,11 +502,11 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, - $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, - $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, - $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, - $ DSYT22, XERBLA + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, DLATMS, + $ DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, DSPEVX, + $ DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, DSTT22, + $ DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, DSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/ddrvst2stg.f b/lapack-netlib/TESTING/EIG/ddrvst2stg.f index c9a2632bb..e38671e38 100644 --- a/lapack-netlib/TESTING/EIG/ddrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/ddrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by DSTEQR simlutaneously +*> The eigenvalues of A, as computed by DSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR, + EXTERNAL ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, $ DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, $ DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, $ DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, @@ -577,7 +577,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f index 10622d77a..aecbdfe93 100644 --- a/lapack-netlib/TESTING/EIG/schkst.f +++ b/lapack-netlib/TESTING/EIG/schkst.f @@ -363,7 +363,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -645,10 +645,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, LOG, MAX, MIN, REAL, SQRT @@ -715,7 +715,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index ac5a3fc39..49899a660 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -384,7 +384,7 @@ *> \verbatim *> D1 is REAL array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -666,10 +666,10 @@ EXTERNAL ILAENV, SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR, - $ SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, - $ SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, - $ SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, + EXTERNAL SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS, + $ SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ, + $ SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF, + $ SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA, $ SSYTRD_2STAGE * .. * .. Intrinsic Functions .. @@ -737,7 +737,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f index ce0d59214..2015cac1c 100644 --- a/lapack-netlib/TESTING/EIG/sdrgsx.f +++ b/lapack-netlib/TESTING/EIG/sdrgsx.f @@ -400,7 +400,7 @@ EXTERNAL SLCTSX, ILAENV, SLAMCH, SLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, SLABAD, + EXTERNAL ALASVM, SGESVD, SGET51, SGET53, SGGESX, $ SLACPY, SLAKF2, SLASET, SLATM5, XERBLA * .. * .. Intrinsic Functions .. @@ -479,7 +479,6 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) ULPINV = ONE / ULP SMLNUM = SLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -985,7 +984,7 @@ c MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 ) $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 877579bcd..0c82b6f49 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -234,7 +234,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -399,7 +399,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA * .. @@ -460,7 +460,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f index ebd169977..38ed61628 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f @@ -240,7 +240,7 @@ *> *> B REAL array, dimension (LDB , max(NN)) *> Used to hold the symmetric positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -408,7 +408,7 @@ EXTERNAL LSAME, SLAMCH, SLARND * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, + EXTERNAL SLACPY, SLAFTS, SLASET, SLASUM, SLATMR, $ SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV, $ SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA, $ SSYGV_2STAGE @@ -470,7 +470,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst.f b/lapack-netlib/TESTING/EIG/sdrvst.f index ea0cf66f9..be6d33cee 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst.f +++ b/lapack-netlib/TESTING/EIG/sdrvst.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,11 +502,11 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, - $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, - $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, - $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, - $ SSYT22, XERBLA + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, SLATMS, + $ SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, SSPEVX, + $ SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, SSTT22, + $ SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, SSYT22, + $ XERBLA * .. * .. Scalars in Common .. CHARACTER*32 SRNAMT @@ -574,7 +574,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/sdrvst2stg.f b/lapack-netlib/TESTING/EIG/sdrvst2stg.f index a13a58b48..e05ec0749 100644 --- a/lapack-netlib/TESTING/EIG/sdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/sdrvst2stg.f @@ -214,7 +214,7 @@ *> Not modified. *> *> D1 REAL array, dimension (max(NN)) -*> The eigenvalues of A, as computed by SSTEQR simlutaneously +*> The eigenvalues of A, as computed by SSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -502,7 +502,7 @@ EXTERNAL SLAMCH, SLARND, SSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR, + EXTERNAL ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, $ SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, $ SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, $ SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, @@ -577,7 +577,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = SLAMCH( 'Overflow' ) - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f index 60496dde1..b77d94438 100644 --- a/lapack-netlib/TESTING/EIG/zchkst.f +++ b/lapack-netlib/TESTING/EIG/zchkst.f @@ -364,7 +364,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -662,11 +662,10 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -733,7 +732,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index b1ef80816..3b333ef1c 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -385,7 +385,7 @@ *> \verbatim *> D1 is DOUBLE PRECISION array of *> dimension( max(NN) ) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> \endverbatim @@ -683,11 +683,11 @@ EXTERNAL ILAENV, DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF, - $ XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, - $ ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, - $ ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, - $ ZUPGTR, ZHETRD_2STAGE, DLASET + EXTERNAL DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA, + $ ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY, + $ ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR, + $ ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR, + $ ZHETRD_2STAGE, DLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT @@ -754,7 +754,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) ) diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f index 096ed3c22..a486873fa 100644 --- a/lapack-netlib/TESTING/EIG/zdrgsx.f +++ b/lapack-netlib/TESTING/EIG/zdrgsx.f @@ -394,7 +394,7 @@ EXTERNAL ZLCTSX, ILAENV, DLAMCH, ZLANGE * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, XERBLA, ZGESVD, ZGET51, ZGGESX, + EXTERNAL ALASVM, XERBLA, ZGESVD, ZGET51, ZGGESX, $ ZLACPY, ZLAKF2, ZLASET, ZLATM5 * .. * .. Scalars in Common .. @@ -479,7 +479,6 @@ ULPINV = ONE / ULP SMLNUM = DLAMCH( 'S' ) / ULP BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) THRSH2 = TEN*THRESH NTESTT = 0 NERRS = 0 @@ -918,7 +917,7 @@ $ / ' 2: A and B are upper triangular matrices, ', $ / ' 3: A and B are as type 2, but each second diagonal ', $ 'block in A_11 and ', / - $ ' each third diaongal block in A_22 are 2x2 blocks,', + $ ' each third diagonal block in A_22 are 2x2 blocks,', $ / ' 4: A and B are block diagonal matrices, ', $ / ' 5: (A,B) has potentially close or common ', $ 'eigenvalues.', / ) diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 71f1d6371..1796805ea 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -236,7 +236,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -418,7 +418,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01 * .. @@ -481,7 +481,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f index 4bdf2849e..c5ef4ce70 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f @@ -242,7 +242,7 @@ *> *> B COMPLEX*16 array, dimension (LDB , max(NN)) *> Used to hold the Hermitian positive definite matrix for -*> the generailzed problem. +*> the generalized problem. *> On exit, B contains the last matrix actually *> used. *> Modified. @@ -426,7 +426,7 @@ EXTERNAL LSAME, DLAMCH, DLARND * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, + EXTERNAL DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD, $ ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD, $ ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01, $ ZHEGV_2STAGE @@ -490,7 +490,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst.f b/lapack-netlib/TESTING/EIG/zdrvst.f index 384e58de1..f838f0af2 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst.f +++ b/lapack-netlib/TESTING/EIG/zdrvst.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,10 +391,10 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, - $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, - $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, - $ ZLATMR, ZLATMS + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, ZHBEVX, + $ ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, ZHET22, + $ ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, ZLATMR, + $ ZLATMS * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, INT, LOG, MAX, MIN, SQRT @@ -451,7 +451,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/EIG/zdrvst2stg.f b/lapack-netlib/TESTING/EIG/zdrvst2stg.f index 4a88e5218..4b989b460 100644 --- a/lapack-netlib/TESTING/EIG/zdrvst2stg.f +++ b/lapack-netlib/TESTING/EIG/zdrvst2stg.f @@ -204,7 +204,7 @@ *> Not modified. *> *> D1 DOUBLE PRECISION array, dimension (max(NN)) -*> The eigenvalues of A, as computed by ZSTEQR simlutaneously +*> The eigenvalues of A, as computed by ZSTEQR simultaneously *> with Z. On exit, the eigenvalues in D1 correspond with the *> matrix in A. *> Modified. @@ -391,7 +391,7 @@ EXTERNAL DLAMCH, DLARND, DSXT1 * .. * .. External Subroutines .. - EXTERNAL ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD, + EXTERNAL ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, $ ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, $ ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, @@ -453,7 +453,6 @@ * UNFL = DLAMCH( 'Safe minimum' ) OVFL = DLAMCH( 'Overflow' ) - CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' ) ULPINV = ONE / ULP RTUNFL = SQRT( UNFL ) diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index f0423a23b..dd75394b3 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -777,7 +777,7 @@ $ 'triangular-pentagonal matrices' ) 8004 FORMAT( / 1X, A3, ': TS factorization for ', $ 'tall-skinny or short-wide matrices' ) - 8005 FORMAT( / 1X, A3, ': Householder recostruction from TSQR', + 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) * * GE matrix types diff --git a/lapack-netlib/TESTING/LIN/cchktp.f b/lapack-netlib/TESTING/LIN/cchktp.f index 18242ff54..e14f1062e 100644 --- a/lapack-netlib/TESTING/LIN/cchktp.f +++ b/lapack-netlib/TESTING/LIN/cchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/cerrhe.f b/lapack-netlib/TESTING/LIN/cerrhe.f index d0c5cf6b5..1798aed57 100644 --- a/lapack-netlib/TESTING/LIN/cerrhe.f +++ b/lapack-netlib/TESTING/LIN/cerrhe.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHETRF @@ -576,7 +576,7 @@ CALL CHKXER( 'CHETRS_AA_STAGE', INFOT, NOUT, LERR, OK ) * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/cerrhex.f b/lapack-netlib/TESTING/LIN/cerrhex.f index b6c889798..83b3a92ad 100644 --- a/lapack-netlib/TESTING/LIN/cerrhex.f +++ b/lapack-netlib/TESTING/LIN/cerrhex.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHETRF @@ -523,7 +523,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CHPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrsy.f b/lapack-netlib/TESTING/LIN/cerrsy.f index a236d2d18..6d284be98 100644 --- a/lapack-netlib/TESTING/LIN/cerrsy.f +++ b/lapack-netlib/TESTING/LIN/cerrsy.f @@ -130,7 +130,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -469,7 +469,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cerrsyx.f b/lapack-netlib/TESTING/LIN/cerrsyx.f index 34972668e..c8d269c84 100644 --- a/lapack-netlib/TESTING/LIN/cerrsyx.f +++ b/lapack-netlib/TESTING/LIN/cerrsyx.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSYTRF @@ -521,7 +521,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * CSPTRF diff --git a/lapack-netlib/TESTING/LIN/cgtt01.f b/lapack-netlib/TESTING/LIN/cgtt01.f index e504515c5..64dc5f8fa 100644 --- a/lapack-netlib/TESTING/LIN/cgtt01.f +++ b/lapack-netlib/TESTING/LIN/cgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cgtt02.f b/lapack-netlib/TESTING/LIN/cgtt02.f index 702e66eed..8b951acd5 100644 --- a/lapack-netlib/TESTING/LIN/cgtt02.f +++ b/lapack-netlib/TESTING/LIN/cgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/chet01_3.f b/lapack-netlib/TESTING/LIN/chet01_3.f index 58104a357..7e07b5790 100644 --- a/lapack-netlib/TESTING/LIN/chet01_3.f +++ b/lapack-netlib/TESTING/LIN/chet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/clqt02.f b/lapack-netlib/TESTING/LIN/clqt02.f index 24eb05399..20297f6cc 100644 --- a/lapack-netlib/TESTING/LIN/clqt02.f +++ b/lapack-netlib/TESTING/LIN/clqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CLQT02 tests CUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, CLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/cptt01.f b/lapack-netlib/TESTING/LIN/cptt01.f index e4520ec3d..3b117ad28 100644 --- a/lapack-netlib/TESTING/LIN/cptt01.f +++ b/lapack-netlib/TESTING/LIN/cptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cptt02.f b/lapack-netlib/TESTING/LIN/cptt02.f index da4f0e854..ffaef89bd 100644 --- a/lapack-netlib/TESTING/LIN/cptt02.f +++ b/lapack-netlib/TESTING/LIN/cptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/cqlt02.f b/lapack-netlib/TESTING/LIN/cqlt02.f index fc4685aa5..53080cd17 100644 --- a/lapack-netlib/TESTING/LIN/cqlt02.f +++ b/lapack-netlib/TESTING/LIN/cqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQLT02 tests CUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, CQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/cqrt02.f b/lapack-netlib/TESTING/LIN/cqrt02.f index 62f176cd8..db22496cf 100644 --- a/lapack-netlib/TESTING/LIN/cqrt02.f +++ b/lapack-netlib/TESTING/LIN/cqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CQRT02 tests CUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, CQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/crqt02.f b/lapack-netlib/TESTING/LIN/crqt02.f index 8625c2f91..81c805dc4 100644 --- a/lapack-netlib/TESTING/LIN/crqt02.f +++ b/lapack-netlib/TESTING/LIN/crqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> CRQT02 tests CUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, CRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/csyt01_3.f b/lapack-netlib/TESTING/LIN/csyt01_3.f index f0b316cef..879749a10 100644 --- a/lapack-netlib/TESTING/LIN/csyt01_3.f +++ b/lapack-netlib/TESTING/LIN/csyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchktp.f b/lapack-netlib/TESTING/LIN/dchktp.f index 9af6150ca..6db5b1376 100644 --- a/lapack-netlib/TESTING/LIN/dchktp.f +++ b/lapack-netlib/TESTING/LIN/dchktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/ddrvab.f b/lapack-netlib/TESTING/LIN/ddrvab.f index 9110d8334..5fd32b951 100644 --- a/lapack-netlib/TESTING/LIN/ddrvab.f +++ b/lapack-netlib/TESTING/LIN/ddrvab.f @@ -346,7 +346,7 @@ CALL DGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/ddrvac.f b/lapack-netlib/TESTING/LIN/ddrvac.f index bd463cee4..3ecbc6a23 100644 --- a/lapack-netlib/TESTING/LIN/ddrvac.f +++ b/lapack-netlib/TESTING/LIN/ddrvac.f @@ -365,7 +365,7 @@ CALL DPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/derrsy.f b/lapack-netlib/TESTING/LIN/derrsy.f index af15a4b8e..eb08d4c7b 100644 --- a/lapack-netlib/TESTING/LIN/derrsy.f +++ b/lapack-netlib/TESTING/LIN/derrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/derrsyx.f b/lapack-netlib/TESTING/LIN/derrsyx.f index c2d14caab..495302158 100644 --- a/lapack-netlib/TESTING/LIN/derrsyx.f +++ b/lapack-netlib/TESTING/LIN/derrsyx.f @@ -138,7 +138,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * DSYTRF @@ -528,7 +528,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * DSPTRF diff --git a/lapack-netlib/TESTING/LIN/dgtt01.f b/lapack-netlib/TESTING/LIN/dgtt01.f index 3b5ff9e4c..6fd754db2 100644 --- a/lapack-netlib/TESTING/LIN/dgtt01.f +++ b/lapack-netlib/TESTING/LIN/dgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dgtt02.f b/lapack-netlib/TESTING/LIN/dgtt02.f index b3268b138..4fecff86e 100644 --- a/lapack-netlib/TESTING/LIN/dgtt02.f +++ b/lapack-netlib/TESTING/LIN/dgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dlqt02.f b/lapack-netlib/TESTING/LIN/dlqt02.f index 158cd0b79..672d82bbc 100644 --- a/lapack-netlib/TESTING/LIN/dlqt02.f +++ b/lapack-netlib/TESTING/LIN/dlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DLQT02 tests DORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, DLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/dptt01.f b/lapack-netlib/TESTING/LIN/dptt01.f index 2c6c440ff..9e9c6bd85 100644 --- a/lapack-netlib/TESTING/LIN/dptt01.f +++ b/lapack-netlib/TESTING/LIN/dptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dptt02.f b/lapack-netlib/TESTING/LIN/dptt02.f index a4802c696..e045b8779 100644 --- a/lapack-netlib/TESTING/LIN/dptt02.f +++ b/lapack-netlib/TESTING/LIN/dptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/dqlt02.f b/lapack-netlib/TESTING/LIN/dqlt02.f index 950cfe67b..7799e0858 100644 --- a/lapack-netlib/TESTING/LIN/dqlt02.f +++ b/lapack-netlib/TESTING/LIN/dqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQLT02 tests DORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, DQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/dqrt02.f b/lapack-netlib/TESTING/LIN/dqrt02.f index d0e4349d1..d41c7b324 100644 --- a/lapack-netlib/TESTING/LIN/dqrt02.f +++ b/lapack-netlib/TESTING/LIN/dqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DQRT02 tests DORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, DQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/drqt02.f b/lapack-netlib/TESTING/LIN/drqt02.f index 7fbb6a6d4..faf639109 100644 --- a/lapack-netlib/TESTING/LIN/drqt02.f +++ b/lapack-netlib/TESTING/LIN/drqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> DRQT02 tests DORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, DRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/dsyt01_3.f b/lapack-netlib/TESTING/LIN/dsyt01_3.f index 2a7d9d142..060a9caed 100644 --- a/lapack-netlib/TESTING/LIN/dsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/dsyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL DSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schktp.f b/lapack-netlib/TESTING/LIN/schktp.f index ff05c1d97..a5243f651 100644 --- a/lapack-netlib/TESTING/LIN/schktp.f +++ b/lapack-netlib/TESTING/LIN/schktp.f @@ -86,7 +86,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/serrsy.f b/lapack-netlib/TESTING/LIN/serrsy.f index c562b417b..ab422ba5c 100644 --- a/lapack-netlib/TESTING/LIN/serrsy.f +++ b/lapack-netlib/TESTING/LIN/serrsy.f @@ -133,7 +133,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * SSYTRF @@ -581,7 +581,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/serrsyx.f b/lapack-netlib/TESTING/LIN/serrsyx.f index ed47c37bd..cffd28a52 100644 --- a/lapack-netlib/TESTING/LIN/serrsyx.f +++ b/lapack-netlib/TESTING/LIN/serrsyx.f @@ -137,7 +137,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) pivoting. * * SSYTRF @@ -527,7 +527,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * SSPTRF diff --git a/lapack-netlib/TESTING/LIN/sgtt01.f b/lapack-netlib/TESTING/LIN/sgtt01.f index 5d88c91f7..5d639af99 100644 --- a/lapack-netlib/TESTING/LIN/sgtt01.f +++ b/lapack-netlib/TESTING/LIN/sgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sgtt02.f b/lapack-netlib/TESTING/LIN/sgtt02.f index a4eb09f6b..6daea295f 100644 --- a/lapack-netlib/TESTING/LIN/sgtt02.f +++ b/lapack-netlib/TESTING/LIN/sgtt02.f @@ -41,14 +41,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose = Transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/slqt02.f b/lapack-netlib/TESTING/LIN/slqt02.f index d59a8a01c..1c7bd4f0d 100644 --- a/lapack-netlib/TESTING/LIN/slqt02.f +++ b/lapack-netlib/TESTING/LIN/slqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SLQT02 tests SORGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, SLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/sptt01.f b/lapack-netlib/TESTING/LIN/sptt01.f index fef18dd2b..5b33bfad5 100644 --- a/lapack-netlib/TESTING/LIN/sptt01.f +++ b/lapack-netlib/TESTING/LIN/sptt01.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sptt02.f b/lapack-netlib/TESTING/LIN/sptt02.f index 51a6a074a..2bd5535aa 100644 --- a/lapack-netlib/TESTING/LIN/sptt02.f +++ b/lapack-netlib/TESTING/LIN/sptt02.f @@ -35,7 +35,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/sqlt02.f b/lapack-netlib/TESTING/LIN/sqlt02.f index 5d381b950..f26352eb3 100644 --- a/lapack-netlib/TESTING/LIN/sqlt02.f +++ b/lapack-netlib/TESTING/LIN/sqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQLT02 tests SORGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, SQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/sqrt02.f b/lapack-netlib/TESTING/LIN/sqrt02.f index 72163f0a9..44b9c6270 100644 --- a/lapack-netlib/TESTING/LIN/sqrt02.f +++ b/lapack-netlib/TESTING/LIN/sqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SQRT02 tests SORGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, SQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/srqt02.f b/lapack-netlib/TESTING/LIN/srqt02.f index ca0594f7a..a33c98ba4 100644 --- a/lapack-netlib/TESTING/LIN/srqt02.f +++ b/lapack-netlib/TESTING/LIN/srqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> SRQT02 tests SORGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, SRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/ssyt01_3.f b/lapack-netlib/TESTING/LIN/ssyt01_3.f index 147553db9..951fcb7d6 100644 --- a/lapack-netlib/TESTING/LIN/ssyt01_3.f +++ b/lapack-netlib/TESTING/LIN/ssyt01_3.f @@ -183,7 +183,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL SSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchktp.f b/lapack-netlib/TESTING/LIN/zchktp.f index 1798c24e7..ab46f5ce5 100644 --- a/lapack-netlib/TESTING/LIN/zchktp.f +++ b/lapack-netlib/TESTING/LIN/zchktp.f @@ -87,7 +87,7 @@ *> \verbatim *> NMAX is INTEGER *> The leading dimension of the work arrays. NMAX >= the -*> maximumm value of N in NVAL. +*> maximum value of N in NVAL. *> \endverbatim *> *> \param[out] AP diff --git a/lapack-netlib/TESTING/LIN/zdrvab.f b/lapack-netlib/TESTING/LIN/zdrvab.f index 130515959..772eb08af 100644 --- a/lapack-netlib/TESTING/LIN/zdrvab.f +++ b/lapack-netlib/TESTING/LIN/zdrvab.f @@ -348,7 +348,7 @@ CALL ZGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zdrvac.f b/lapack-netlib/TESTING/LIN/zdrvac.f index 20f8eb1e5..bbf73a263 100644 --- a/lapack-netlib/TESTING/LIN/zdrvac.f +++ b/lapack-netlib/TESTING/LIN/zdrvac.f @@ -367,7 +367,7 @@ CALL ZPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK, $ LDA, RWORK, RESULT( 1 ) ) * -* Check if the test passes the tesing. +* Check if the test passes the testing. * Print information about the tests that did not * pass the testing. * diff --git a/lapack-netlib/TESTING/LIN/zerrhe.f b/lapack-netlib/TESTING/LIN/zerrhe.f index e49e5037c..40dd25d75 100644 --- a/lapack-netlib/TESTING/LIN/zerrhe.f +++ b/lapack-netlib/TESTING/LIN/zerrhe.f @@ -135,7 +135,7 @@ IF( LSAMEN( 2, C2, 'HE' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHETRF @@ -580,7 +580,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrhex.f b/lapack-netlib/TESTING/LIN/zerrhex.f index b6e8b77ef..bdcdefff7 100644 --- a/lapack-netlib/TESTING/LIN/zerrhex.f +++ b/lapack-netlib/TESTING/LIN/zerrhex.f @@ -138,7 +138,7 @@ OK = .TRUE. * * Test error exits of the routines that use factorization -* of a Hermitian indefinite matrix with patrial +* of a Hermitian indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * IF( LSAMEN( 2, C2, 'HE' ) ) THEN @@ -526,7 +526,7 @@ ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * Test error exits of the routines that use factorization -* of a Hermitian indefinite packed matrix with patrial +* of a Hermitian indefinite packed matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZHPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrsy.f b/lapack-netlib/TESTING/LIN/zerrsy.f index a50f9d24e..932df1936 100644 --- a/lapack-netlib/TESTING/LIN/zerrsy.f +++ b/lapack-netlib/TESTING/LIN/zerrsy.f @@ -132,7 +132,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -471,7 +471,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zerrsyx.f b/lapack-netlib/TESTING/LIN/zerrsyx.f index 23d2a5a2b..9d5e71288 100644 --- a/lapack-netlib/TESTING/LIN/zerrsyx.f +++ b/lapack-netlib/TESTING/LIN/zerrsyx.f @@ -139,7 +139,7 @@ IF( LSAMEN( 2, C2, 'SY' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite matrix with patrial +* of a symmetric indefinite matrix with partial * (Bunch-Kaufman) diagonal pivoting method. * * ZSYTRF @@ -525,7 +525,7 @@ ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN * * Test error exits of the routines that use factorization -* of a symmetric indefinite packed matrix with patrial +* of a symmetric indefinite packed matrix with partial * (Bunch-Kaufman) pivoting. * * ZSPTRF diff --git a/lapack-netlib/TESTING/LIN/zgtt01.f b/lapack-netlib/TESTING/LIN/zgtt01.f index c63fdbc25..a8ac13d2d 100644 --- a/lapack-netlib/TESTING/LIN/zgtt01.f +++ b/lapack-netlib/TESTING/LIN/zgtt01.f @@ -39,7 +39,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zgtt02.f b/lapack-netlib/TESTING/LIN/zgtt02.f index 7362967be..f86fe9244 100644 --- a/lapack-netlib/TESTING/LIN/zgtt02.f +++ b/lapack-netlib/TESTING/LIN/zgtt02.f @@ -40,14 +40,14 @@ *> \verbatim *> TRANS is CHARACTER *> Specifies the form of the residual. -*> = 'N': B - A * X (No transpose) +*> = 'N': B - A * X (No transpose) *> = 'T': B - A**T * X (Transpose) *> = 'C': B - A**H * X (Conjugate transpose) *> \endverbatim *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. N >= 0. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zhet01_3.f b/lapack-netlib/TESTING/LIN/zhet01_3.f index 0a76404d6..5beed9042 100644 --- a/lapack-netlib/TESTING/LIN/zhet01_3.f +++ b/lapack-netlib/TESTING/LIN/zhet01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zlqt02.f b/lapack-netlib/TESTING/LIN/zlqt02.f index c55d76ccd..9ba98ff1f 100644 --- a/lapack-netlib/TESTING/LIN/zlqt02.f +++ b/lapack-netlib/TESTING/LIN/zlqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZLQT02 tests ZUNGLQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the LQ factorization of an m-by-n matrix A, ZLQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zptt01.f b/lapack-netlib/TESTING/LIN/zptt01.f index e842c7e46..9f2359c2a 100644 --- a/lapack-netlib/TESTING/LIN/zptt01.f +++ b/lapack-netlib/TESTING/LIN/zptt01.f @@ -36,7 +36,7 @@ * *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zptt02.f b/lapack-netlib/TESTING/LIN/zptt02.f index 6e3a8aed0..8a1e2961c 100644 --- a/lapack-netlib/TESTING/LIN/zptt02.f +++ b/lapack-netlib/TESTING/LIN/zptt02.f @@ -46,7 +46,7 @@ *> *> \param[in] N *> \verbatim -*> N is INTEGTER +*> N is INTEGER *> The order of the matrix A. *> \endverbatim *> diff --git a/lapack-netlib/TESTING/LIN/zqlt02.f b/lapack-netlib/TESTING/LIN/zqlt02.f index 1f84cfa5a..7cb889931 100644 --- a/lapack-netlib/TESTING/LIN/zqlt02.f +++ b/lapack-netlib/TESTING/LIN/zqlt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQLT02 tests ZUNGQL, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QL factorization of an m-by-n matrix A, ZQLT02 generates diff --git a/lapack-netlib/TESTING/LIN/zqrt02.f b/lapack-netlib/TESTING/LIN/zqrt02.f index 2dbefaf84..a32703c49 100644 --- a/lapack-netlib/TESTING/LIN/zqrt02.f +++ b/lapack-netlib/TESTING/LIN/zqrt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZQRT02 tests ZUNGQR, which generates an m-by-n matrix Q with -*> orthonornmal columns that is defined as the product of k elementary +*> orthonormal columns that is defined as the product of k elementary *> reflectors. *> *> Given the QR factorization of an m-by-n matrix A, ZQRT02 generates diff --git a/lapack-netlib/TESTING/LIN/zrqt02.f b/lapack-netlib/TESTING/LIN/zrqt02.f index 548321d00..c44e04cb6 100644 --- a/lapack-netlib/TESTING/LIN/zrqt02.f +++ b/lapack-netlib/TESTING/LIN/zrqt02.f @@ -27,7 +27,7 @@ *> \verbatim *> *> ZRQT02 tests ZUNGRQ, which generates an m-by-n matrix Q with -*> orthonornmal rows that is defined as the product of k elementary +*> orthonormal rows that is defined as the product of k elementary *> reflectors. *> *> Given the RQ factorization of an m-by-n matrix A, ZRQT02 generates diff --git a/lapack-netlib/TESTING/LIN/zsyt01_3.f b/lapack-netlib/TESTING/LIN/zsyt01_3.f index 202488db9..a572353f2 100644 --- a/lapack-netlib/TESTING/LIN/zsyt01_3.f +++ b/lapack-netlib/TESTING/LIN/zsyt01_3.f @@ -188,7 +188,7 @@ RETURN END IF * -* a) Revert to multiplyers of L +* a) Revert to multipliers of L * CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO ) * From 0c38ebd5990f7c44529cabbbadf7806eb6afa0c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 17:31:55 +0200 Subject: [PATCH 078/718] Extend tests for error exit sysv/sytd2/gehd2 (Reference-LAPACK PR 795) --- lapack-netlib/TESTING/EIG/cerrhs.f | 70 +++++++++++++++++++++++++++-- lapack-netlib/TESTING/EIG/cerrst.f | 18 +++++++- lapack-netlib/TESTING/EIG/derrhs.f | 72 ++++++++++++++++++++++++++++-- lapack-netlib/TESTING/EIG/derrst.f | 24 +++++++--- lapack-netlib/TESTING/EIG/serrhs.f | 72 ++++++++++++++++++++++++++++-- lapack-netlib/TESTING/EIG/serrst.f | 18 +++++++- lapack-netlib/TESTING/EIG/zerrhs.f | 70 +++++++++++++++++++++++++++-- lapack-netlib/TESTING/EIG/zerrst.f | 18 +++++++- 8 files changed, 337 insertions(+), 25 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cerrhs.f b/lapack-netlib/TESTING/EIG/cerrhs.f index 0568a6d78..2dd86b8c9 100644 --- a/lapack-netlib/TESTING/EIG/cerrhs.f +++ b/lapack-netlib/TESTING/EIG/cerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CUNGHR, -*> CUNMHR, CHSEQR, CHSEIN, and CTREVC. +*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CGEHD2, +*> CUNGHR, CUNMHR, CHSEQR, CHSEIN, CTREVC, and CTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, CGEBAK, CGEBAL, CGEHRD, CHSEIN, CHSEQR, - $ CUNGHR, CUNMHR, CTREVC + $ CUNGHR, CUNMHR, CTREVC, CTREVC3, CGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'CGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* CGEHD2 +* + SRNAMT = 'CGEHD2' + INFOT = 1 + CALL CGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * CUNGHR * SRNAMT = 'CUNGHR' @@ -398,6 +421,47 @@ $ RW, INFO ) CALL CHKXER( 'CTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* CTREVC3 +* + SRNAMT = 'CTREVC3' + INFOT = 1 + CALL CTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 18b5cad36..ba97afbe5 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> CERRST tests the error exits for CHETRD, CUNGTR, CUNMTR, CHPTRD, +*> CERRST tests the error exits for CHETRD, CHETD2, CUNGTR, CUNMTR, CHPTRD, *> CUNGTR, CUPMTR, CSTEQR, CSTEIN, CPTEQR, CHBTRD, *> CHEEV, CHEEVX, CHEEVD, CHBEV, CHBEVX, CHBEVD, *> CHPEV, CHPEVX, CHPEVD, and CSTEDC. @@ -94,7 +94,7 @@ EXTERNAL CHBEV, CHBEVD, CHBEVX, CHBTRD, CHEEV, CHEEVD, $ CHEEVR, CHEEVX, CHETRD, CHKXER, CHPEV, CHPEVD, $ CHPEVX, CHPTRD, CPTEQR, CSTEDC, CSTEIN, CSTEQR, - $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, + $ CUNGTR, CUNMTR, CUPGTR, CUPMTR, CHETD2, $ CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE, $ CHEEV_2STAGE, CHBEV_2STAGE, CHBEVD_2STAGE, $ CHBEVX_2STAGE, CHETRD_2STAGE, CHETRD_HE2HB, @@ -156,6 +156,20 @@ CALL CHKXER( 'CHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* CHETD2 +* + SRNAMT = 'CHETD2' + INFOT = 1 + CALL CHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * CHETRD_2STAGE * SRNAMT = 'CHETRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/derrhs.f b/lapack-netlib/TESTING/EIG/derrhs.f index fec41c0f0..583bebc9b 100644 --- a/lapack-netlib/TESTING/EIG/derrhs.f +++ b/lapack-netlib/TESTING/EIG/derrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DERRHS tests the error exits for DGEBAK, SGEBAL, SGEHRD, DORGHR, -*> DORMHR, DHSEQR, SHSEIN, and DTREVC. +*> DERRHS tests the error exits for DGEBAK, DGEBAL, DGEHRD, DGEHD2, +*> DORGHR, DORMHR, DHSEQR, DHSEIN, DTREVC, and DTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEBAK, DGEBAL, DGEHRD, DHSEIN, DHSEQR, - $ DORGHR, DORMHR, DTREVC + $ DORGHR, DORMHR, DTREVC, DTREVC3, DGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC DBLE @@ -194,6 +194,29 @@ CALL CHKXER( 'DGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* DGEHD2 +* + SRNAMT = 'DGEHD2' + INFOT = 1 + CALL DGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * DORGHR * SRNAMT = 'DORGHR' @@ -328,7 +351,11 @@ CALL DHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * DHSEIN * @@ -399,6 +426,43 @@ $ INFO ) CALL CHKXER( 'DTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* DTREVC3 +* + SRNAMT = 'DTREVC3' + INFOT = 1 + CALL DTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index f297e5a7d..a55b6eea9 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -21,10 +21,10 @@ *> *> \verbatim *> -*> DERRST tests the error exits for DSYTRD, DORGTR, DORMTR, DSPTRD, -*> DOPGTR, DOPMTR, DSTEQR, SSTERF, SSTEBZ, SSTEIN, DPTEQR, DSBTRD, -*> DSYEV, SSYEVX, SSYEVD, DSBEV, SSBEVX, SSBEVD, -*> DSPEV, SSPEVX, SSPEVD, DSTEV, SSTEVX, SSTEVD, and SSTEDC. +*> DERRST tests the error exits for DSYTRD, DSYTD2, DORGTR, DORMTR, DSPTRD, +*> DOPGTR, DOPMTR, DSTEQR, DSTERF, DSTEBZ, DSTEIN, DPTEQR, DSBTRD, +*> DSYEV, DSYEVX, DSYEVD, DSBEV, DSBEVX, DSBEVD, +*> DSPEV, DSPEVX, DSPEVD, DSTEV, DSTEVX, DSTEVD, and DSTEDC. *> DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, *> DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, *> DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -95,7 +95,7 @@ $ DSBEV, DSBEVD, DSBEVX, DSBTRD, DSPEV, DSPEVD, $ DSPEVX, DSPTRD, DSTEBZ, DSTEDC, DSTEIN, DSTEQR, $ DSTERF, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSYEV, - $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, + $ DSYEVD, DSYEVR, DSYEVX, DSYTRD, DSYTD2, $ DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE, $ DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE, $ DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'DSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* DSYTD2 +* + SRNAMT = 'DSYTD2' + INFOT = 1 + CALL DSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * DSYTRD_2STAGE * SRNAMT = 'DSYTRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/serrhs.f b/lapack-netlib/TESTING/EIG/serrhs.f index 8f0ff98a1..89b7303cd 100644 --- a/lapack-netlib/TESTING/EIG/serrhs.f +++ b/lapack-netlib/TESTING/EIG/serrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SORGHR, -*> SORMHR, SHSEQR, SHSEIN, and STREVC. +*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SGEHD2, +*> SORGHR, SORMHR, SHSEQR, SHSEIN, STREVC, and STREVC3. *> \endverbatim * * Arguments: @@ -85,7 +85,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, SGEBAK, SGEBAL, SGEHRD, SHSEIN, SHSEQR, - $ SORGHR, SORMHR, STREVC + $ SORGHR, SORMHR, STREVC, STREVC3, SGEHD2 * .. * .. Intrinsic Functions .. INTRINSIC REAL @@ -193,6 +193,29 @@ CALL CHKXER( 'SGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* SGEHD2 +* + SRNAMT = 'SGEHD2' + INFOT = 1 + CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * SORGHR * SRNAMT = 'SORGHR' @@ -327,7 +350,11 @@ CALL SHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, $ INFO ) CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) - NT = NT + 9 + INFOT = 13 + CALL SHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1, + $ INFO ) + CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK ) + NT = NT + 10 * * SHSEIN * @@ -398,6 +425,43 @@ $ INFO ) CALL CHKXER( 'STREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* STREVC3 +* + SRNAMT = 'STREVC3' + INFOT = 1 + CALL STREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL STREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, INFO ) + CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index afb6d4faf..8c9c0f306 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> SERRST tests the error exits for SSYTRD, SORGTR, SORMTR, SSPTRD, +*> SERRST tests the error exits for SSYTRD, SSYTD2, SORGTR, SORMTR, SSPTRD, *> SOPGTR, SOPMTR, SSTEQR, SSTERF, SSTEBZ, SSTEIN, SPTEQR, SSBTRD, *> SSYEV, SSYEVX, SSYEVD, SSBEV, SSBEVX, SSBEVD, *> SSPEV, SSPEVX, SSPEVD, SSTEV, SSTEVX, SSTEVD, and SSTEDC. @@ -95,7 +95,7 @@ $ SSBEV, SSBEVD, SSBEVX, SSBTRD, SSPEV, SSPEVD, $ SSPEVX, SSPTRD, SSTEBZ, SSTEDC, SSTEIN, SSTEQR, $ SSTERF, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSYEV, - $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, + $ SSYEVD, SSYEVR, SSYEVX, SSYTRD, SSYTD2, $ SSYEVD_2STAGE, SSYEVR_2STAGE, SSYEVX_2STAGE, $ SSYEV_2STAGE, SSBEV_2STAGE, SSBEVD_2STAGE, $ SSBEVX_2STAGE, SSYTRD_2STAGE, SSYTRD_SY2SB, @@ -157,6 +157,20 @@ CALL CHKXER( 'SSYTRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* SSYTD2 +* + SRNAMT = 'SSYTD2' + INFOT = 1 + CALL SSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * SSYTRD_2STAGE * SRNAMT = 'SSYTRD_2STAGE' diff --git a/lapack-netlib/TESTING/EIG/zerrhs.f b/lapack-netlib/TESTING/EIG/zerrhs.f index 582338947..3a70a556e 100644 --- a/lapack-netlib/TESTING/EIG/zerrhs.f +++ b/lapack-netlib/TESTING/EIG/zerrhs.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZERRHS tests the error exits for ZGEBAK, CGEBAL, CGEHRD, ZUNGHR, -*> ZUNMHR, ZHSEQR, CHSEIN, and ZTREVC. +*> ZERRHS tests the error exits for ZGEBAK, ZGEBAL, ZGEHRD, ZGEHD2, +*> ZUNGHR, ZUNMHR, ZHSEQR, ZHSEIN, ZTREVC, and ZTREVC3. *> \endverbatim * * Arguments: @@ -86,7 +86,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEBAK, ZGEBAL, ZGEHRD, ZHSEIN, ZHSEQR, - $ ZTREVC, ZUNGHR, ZUNMHR + $ ZUNGHR, ZUNMHR, ZTREVC, ZTREVC3 * .. * .. Intrinsic Functions .. INTRINSIC DBLE @@ -193,6 +193,29 @@ CALL CHKXER( 'ZGEHRD', INFOT, NOUT, LERR, OK ) NT = NT + 7 * +* ZGEHD2 +* + SRNAMT = 'ZGEHD2' + INFOT = 1 + CALL ZGEHD2( -1, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 0, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEHD2( 0, 2, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 1, 1, 0, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEHD2( 0, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEHD2( 2, 1, 1, A, 1, TAU, W, INFO ) + CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK ) + NT = NT + 6 +* * ZUNGHR * SRNAMT = 'ZUNGHR' @@ -389,6 +412,47 @@ $ INFO ) CALL CHKXER( 'ZTREVC', INFOT, NOUT, LERR, OK ) NT = NT + 7 +* +* ZTREVC3 +* + SRNAMT = 'ZTREVC3' + INFOT = 1 + CALL ZTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W, + $ LW, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ 2, RW, 2, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + INFOT = 16 + CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W, + $ LW, RW, 1, INFO ) + CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK ) + NT = NT + 9 END IF * * Print a summary line. diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index 5b0e6f820..948f94bc2 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> ZERRST tests the error exits for ZHETRD, ZUNGTR, CUNMTR, ZHPTRD, +*> ZERRST tests the error exits for ZHETRD, ZHETD2, ZUNGTR, CUNMTR, ZHPTRD, *> ZUNGTR, ZUPMTR, ZSTEQR, CSTEIN, ZPTEQR, ZHBTRD, *> ZHEEV, CHEEVX, CHEEVD, ZHBEV, CHBEVX, CHBEVD, *> ZHPEV, CHPEVX, CHPEVD, and ZSTEDC. @@ -95,7 +95,7 @@ EXTERNAL CHKXER, ZHBEV, ZHBEVD, ZHBEVX, ZHBTRD, ZHEEV, $ ZHEEVD, ZHEEVR, ZHEEVX, ZHETRD, ZHPEV, ZHPEVD, $ ZHPEVX, ZHPTRD, ZPTEQR, ZSTEDC, ZSTEIN, ZSTEQR, - $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, + $ ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, ZHETD2, $ ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE, $ ZHEEV_2STAGE, ZHBEV_2STAGE, ZHBEVD_2STAGE, $ ZHBEVX_2STAGE, ZHETRD_2STAGE @@ -156,6 +156,20 @@ CALL CHKXER( 'ZHETRD', INFOT, NOUT, LERR, OK ) NT = NT + 4 * +* ZHETD2 +* + SRNAMT = 'ZHETD2' + INFOT = 1 + CALL ZHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) + NT = NT + 3 +* * ZHETRD_2STAGE * SRNAMT = 'ZHETRD_2STAGE' From ca1791cfebe72ad5f92b538cc6a2c2a33cee72f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 17:35:00 +0200 Subject: [PATCH 079/718] Extend tests for error exit sysv/sytd2/gehd2 (Reference-LAPACK PR 795) --- lapack-netlib/TESTING/LIN/cerrvx.f | 57 +++++++++++-- lapack-netlib/TESTING/LIN/derrvx.f | 44 ++++++---- lapack-netlib/TESTING/LIN/serrvx.f | 24 +++++- lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f | 2 +- lapack-netlib/TESTING/LIN/zerrvx.f | 87 +++++++++++++++----- 5 files changed, 167 insertions(+), 47 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f index 701abd161..548d09d0c 100644 --- a/lapack-netlib/TESTING/LIN/cerrvx.f +++ b/lapack-netlib/TESTING/LIN/cerrvx.f @@ -87,7 +87,7 @@ * .. * .. External Subroutines .. EXTERNAL CGBSV, CGBSVX, CGESV, CGESVX, CGTSV, CGTSVX, - $ CHESV, CHESV_RK ,CHESV_ROOK, CHESVX, CHKXER, + $ CHESV, CHESV_RK, CHESV_ROOK, CHESVX, CHKXER, $ CHPSV, CHPSVX, CPBSV, CPBSVX, CPOSV, CPOSVX, $ CPPSV, CPPSVX, CPTSV, CPTSVX, CSPSV, CSPSVX, $ CSYSV, CSYSV_AA, CSYSV_RK, CSYSV_ROOK, @@ -651,6 +651,9 @@ INFOT = 3 CALL CHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -710,9 +713,15 @@ INFOT = 3 CALL CHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -733,16 +742,44 @@ CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* CSYSV_AASEN +* + SRNAMT = 'CSYSV_AA' + INFOT = 1 + CALL CSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -763,14 +800,18 @@ CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * @@ -834,6 +875,9 @@ INFOT = 3 CALL CSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK ) @@ -898,6 +942,9 @@ INFOT = 3 CALL CSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL CSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK ) diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f index 447b00bc4..f2d29f7a3 100644 --- a/lapack-netlib/TESTING/LIN/derrvx.f +++ b/lapack-netlib/TESTING/LIN/derrvx.f @@ -699,21 +699,27 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* DSYSV_AA -* - SRNAMT = 'DSYSV_AA' - INFOT = 1 - CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) +* DSYSV_AASEN +* + SRNAMT = 'DSYSV_AA' + INFOT = 1 + CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -734,14 +740,18 @@ CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f index c3db47332..440f9113e 100644 --- a/lapack-netlib/TESTING/LIN/serrvx.f +++ b/lapack-netlib/TESTING/LIN/serrvx.f @@ -582,6 +582,9 @@ INFOT = 3 CALL SSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK ) @@ -647,6 +650,9 @@ INFOT = 3 CALL SSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK ) @@ -694,7 +700,7 @@ * ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN * -* SSYSV_AA +* SSYSV_AASEN * SRNAMT = 'SSYSV_AA' INFOT = 1 @@ -706,13 +712,19 @@ INFOT = 3 CALL SSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL SSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * -* DSYSV_AASEN_2STAGE +* SSYSV_AASEN_2STAGE * SRNAMT = 'SSYSV_AA_2STAGE' INFOT = 1 @@ -729,14 +741,18 @@ CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) * diff --git a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f index 21497477c..5406a76bf 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f @@ -229,7 +229,7 @@ * Test path * PATH( 1: 1 ) = 'Zomplex precision' - PATH( 2: 3 ) = 'H2' + PATH( 2: 3 ) = 'S2' * * Path to generate matrices * diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f index ea7823df3..80b3aaf4a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvx.f +++ b/lapack-netlib/TESTING/LIN/zerrvx.f @@ -653,6 +653,9 @@ INFOT = 3 CALL ZHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK ) @@ -700,21 +703,27 @@ * ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN * -* ZHESV_AA -* - SRNAMT = 'ZHESV_AA' - INFOT = 1 - CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) +* ZHESV_AASEN +* + SRNAMT = 'ZHESV_AA' + INFOT = 1 + CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN * @@ -735,16 +744,44 @@ CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK ) +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* ZSYSV_AASEN +* + SRNAMT = 'ZSYSV_AA' + INFOT = 1 + CALL ZSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO ) + CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN * @@ -765,17 +802,21 @@ CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 5 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1, + $ W, 1, INFO ) + CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 11 CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) - INFOT = 7 - CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2, + INFOT = 13 + CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2, $ W, 1, INFO ) CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK ) -** +* ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN * * ZHPSV @@ -836,6 +877,9 @@ INFOT = 3 CALL ZSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK ) @@ -900,6 +944,9 @@ INFOT = 3 CALL ZSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO ) + CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) INFOT = 8 CALL ZSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO ) CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK ) From 2bbd61950e504e9ae3b0df2a1c6c20946bc4183d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:35:57 +0200 Subject: [PATCH 080/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- lapack-netlib/LAPACKE/include/lapacke.h | 36 ++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 9998b1504..9bd228064 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -12727,7 +12727,41 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int lda, lapack_complex_double* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb ); - + +//LAPACK 3.10.0 +lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ); +lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ); +lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ); +lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ); +lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ); +lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ); +lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ); +lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ); + /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); int LAPACKE_get_nancheck( void ); From bc967e770906a74b6a8de8701b9f22f3815c3d91 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:41:46 +0200 Subject: [PATCH 081/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- lapack-netlib/LAPACKE/include/lapack.h | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 3e7f9de5b..a0fcaa259 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -11517,6 +11517,22 @@ void LAPACK_sorgtsqr_row( float* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_dorhr_col LAPACK_GLOBAL(dorhr_col,DORHR_COL) +void LAPACK_dorhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, double* A, + lapack_int const* lda, double* T, + lapack_int const* ldt, double* D, + lapack_int* info ); + +#define LAPACK_sorhr_col LAPACK_GLOBAL(sorhr_col,SORHR_COL) +void LAPACK_sorhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, float* A, + lapack_int const* lda, float* T, + lapack_int const* ldt, float* D, + lapack_int* info ); + #define LAPACK_dormbr_base LAPACK_GLOBAL(dormbr,DORMBR) void LAPACK_dormbr_base( char const* vect, char const* side, char const* trans, @@ -22708,6 +22724,22 @@ void LAPACK_zungtsqr_row( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cunhr_col LAPACK_GLOBAL(cunhr_col,CUNHR_COL) +void LAPACK_cunhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, lapack_complex_float* A, + lapack_int const* lda, lapack_complex_float* T, + lapack_int const* ldt, lapack_complex_float* D, + lapack_int* info ); + +#define LAPACK_zunhr_col LAPACK_GLOBAL(zunhr_col,ZUNHR_COL) +void LAPACK_zunhr_col( + lapack_int const* m, lapack_int const* n, + lapack_int const* nb, lapack_complex_double* A, + lapack_int const* lda, lapack_complex_double* T, + lapack_int const* ldt, lapack_complex_double* D, + lapack_int* info ); + #define LAPACK_cunmbr_base LAPACK_GLOBAL(cunmbr,CUNMBR) void LAPACK_cunmbr_base( char const* vect, char const* side, char const* trans, From 4f10899f5aeb7308fe87998c0f61ee2bd5026807 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:43:18 +0200 Subject: [PATCH 082/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c | 24 +++++++ .../LAPACKE/src/lapacke_cunhr_col_work.c | 69 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c | 24 +++++++ .../LAPACKE/src/lapacke_dorhr_col_work.c | 69 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c | 24 +++++++ .../LAPACKE/src/lapacke_sorhr_col_work.c | 69 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c | 24 +++++++ .../LAPACKE/src/lapacke_zunhr_col_work.c | 69 +++++++++++++++++++ 8 files changed, 372 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c new file mode 100644 index 000000000..7ed1ad4c4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_cunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c new file mode 100644 index 000000000..76b8366f0 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_float* a, + lapack_int lda, lapack_complex_float* t, + lapack_int ldt, lapack_complex_float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_float* a_t = NULL; + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c new file mode 100644 index 000000000..1f37725e9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_dorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c new file mode 100644 index 000000000..28b80cc02 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, double* a, + lapack_int lda, double* t, + lapack_int ldt, double* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + double* a_t = NULL; + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*) + LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*) + LAPACKE_malloc( sizeof(double) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c new file mode 100644 index 000000000..60e6e7951 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_sorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c new file mode 100644 index 000000000..56d6a965e --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, float* a, + lapack_int lda, float* t, + lapack_int ldt, float* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + float* a_t = NULL; + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*) + LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*) + LAPACKE_malloc( sizeof(float) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c new file mode 100644 index 000000000..7e2507daf --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c @@ -0,0 +1,24 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d) +{ + lapack_int info = 0; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -5; + } + } +#endif + /* Call middle-level interface */ + info = LAPACKE_zunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d ); + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c new file mode 100644 index 000000000..b5e640177 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c @@ -0,0 +1,69 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int nb, lapack_complex_double* a, + lapack_int lda, lapack_complex_double* t, + lapack_int ldt, lapack_complex_double* d ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldt_t = MAX(1,MIN(nb,n)); + lapack_complex_double* a_t = NULL; + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -6; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + if( ldt < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * + ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t, + ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info ); + } + return info; +} From 2182cc0ff5bf5b77701d6aab19e4942ef2c6061b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:48:21 +0200 Subject: [PATCH 083/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- lapack-netlib/LAPACKE/src/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt index 4171a3bd4..89890f923 100644 --- a/lapack-netlib/LAPACKE/src/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt @@ -589,6 +589,8 @@ lapacke_cungtr.c lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c +lapacke_cunhr_col.c +lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -857,6 +859,8 @@ lapacke_dorgtr.c lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c +lapacke_dorhr_col.c +lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1432,6 +1436,8 @@ lapacke_sorgtr.c lapacke_sorgtr_work.c lapacke_sorgtsqr_row.c lapacke_sorgtsqr_row_work.c +lapacke_sorhr_col.c +lapacke_sorhr_col_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -2346,6 +2352,8 @@ lapacke_zungtr.c lapacke_zungtr_work.c lapacke_zungtsqr_row.c lapacke_zungtsqr_row_work.c +lapacke_zunhr_col.c +lapacke_zunhr_col_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c From 58ac660d078e44fa33a8bbce4c6268b54c3cf63a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:52:07 +0200 Subject: [PATCH 084/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- lapack-netlib/LAPACKE/src/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 9c02c1445..d35e2a2d7 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -640,6 +640,8 @@ lapacke_cungtr.o \ lapacke_cungtr_work.o \ lapacke_cungtsqr_row.o \ lapacke_cungtsqr_row_work.o \ +lapacke_cunhr_col.o \ +lapacke_cunhr_col_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -912,6 +914,8 @@ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ lapacke_dorgtsqr_row.o \ lapacke_dorgtsqr_row_work.o \ +lapacke_dorhr_col.o \ +lapacke_dorhr_col_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1486,6 +1490,8 @@ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ lapacke_sorgtsqr_row.o \ lapacke_sorgtsqr_row_work.o \ +lapacke_sorhr_col.o \ +lapacke_sorhr_col_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -2404,6 +2410,8 @@ lapacke_zungtr.o \ lapacke_zungtr_work.o \ lapacke_zungtsqr_row.o \ lapacke_zungtsqr_row_work.o \ +lapacke_zunhr_col.o \ +lapacke_zunhr_col_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ From a789211a2e1b60ef4c197709fca22606be01417e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 May 2023 23:56:27 +0200 Subject: [PATCH 085/718] Add interfaces for [cz]unhr_col and [sd]orhr_col (Reference-LAPACK PR 827) --- cmake/lapacke.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index be6a286fe..c8cfa7778 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -590,6 +590,8 @@ set(CSRC lapacke_cungtr_work.c lapacke_cungtsqr_row.c lapacke_cungtsqr_row_work.c + lapacke_cunhr_col.c + lapacke_cunhr_col_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -862,6 +864,8 @@ set(DSRC lapacke_dorgtr_work.c lapacke_dorgtsqr_row.c lapacke_dorgtsqr_row_work.c + lapacke_dorhr_col.c + lapacke_dorhr_col_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1435,6 +1439,8 @@ set(SSRC lapacke_sorgtr_work.c lapacke_sorgtsqr_row.c lapacke_sorgtsqr_row_work.c + lapacke_sorhr_col.c + lapacke_sorhr_col_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -2351,6 +2357,8 @@ set(ZSRC lapacke_zungtr_work.c lapacke_zungtsqr_row.c lapacke_zungtsqr_row_work.c + lapacke_zunhr_col.c + lapacke_zunhr_col_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c From f249ccb741653b8f72783bdd20912fc2baef50ca Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Thu, 18 May 2023 23:51:37 +0800 Subject: [PATCH 086/718] Fix spr sbgemm error --- cpuid_x86.c | 8 ++++++-- kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c | 11 +++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 69cbba90e..c2486e380 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1479,6 +1479,8 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 15: // Sapphire Rapids + if(support_amx_bf16()) + return CPUTYPE_SAPPHIRERAPIDS; if(support_avx512_bf16()) return CPUTYPE_COOPERLAKE; if(support_avx512()) @@ -1845,7 +1847,8 @@ static char *cpuname[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *lowercpuname[] = { @@ -1902,7 +1905,8 @@ static char *lowercpuname[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; static char *corename[] = { diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c index 90e0a32c7..5ee3c8532 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -97,33 +97,32 @@ typedef struct { #define T_C10 6 #define T_C11 7 -// FIXME: gcc11 seem have problem in tile load/store address calc, -// need to multiply with element size (2 or 4) here. + #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) #define LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define MASK_LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) #define LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MASK_LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) From 0b83088887af276562728162c01f26da8e85ab22 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Fri, 19 May 2023 00:19:50 +0800 Subject: [PATCH 087/718] spr dynamic arch support --- Makefile.system | 2 +- driver/others/dynamic.c | 34 ++++++++++++++++++++++++++++- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 10 ++++++++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 343b94bb3..e37b8b45a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX COOPERLAKE +DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS endif endif endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f61930983..dee5538fa 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -268,9 +268,11 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; extern gotoblas_t gotoblas_COOPERLAKE; +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL #define gotoblas_COOPERLAKE gotoblas_HASWELL +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL #endif #endif #else @@ -279,6 +281,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_COOPERLAKE gotoblas_NEHALEM +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -378,6 +381,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -689,6 +717,8 @@ static gotoblas_t *get_coretype(void){ } } if (model == 15){ // Sapphire Rapids + if(support_amx_bf16()) + return &gotoblas_SAPPHIRERAPIDS; if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; if (support_avx512()) @@ -941,7 +971,8 @@ static char *corename[] = { "Excavator", "Zen", "SkylakeX", - "Cooperlake" + "Cooperlake", + "SapphireRapids" }; char *gotoblas_corename(void) { @@ -1006,6 +1037,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; + if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; return corename[0]; } diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 88f574668..d101503bb 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,6 +1,14 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE -SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c +undefine SBGEMM_SMALL_M_PERMIT +undefine SBGEMM_SMALL_K_NN +undefine SBGEMM_SMALL_K_B0_NN +undefine SBGEMM_SMALL_K_NT +undefine SBGEMM_SMALL_K_B0_NT +undefine SBGEMM_SMALL_K_TN +undefine SBGEMM_SMALL_K_B0_TN +undefine SBGEMM_SMALL_K_TT +undefine SBGEMM_SMALL_K_B0_TT SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c From 90f041e348af30de319ddbb0e1b4ade6d74fc380 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Fri, 19 May 2023 00:21:16 +0800 Subject: [PATCH 088/718] Invoke the syscall to allow the use of amx tiles --- common_param.h | 1 + interface/gemm.c | 31 +++++++++++++++++++++++++++++++ kernel/setparam-ref.c | 7 +++++++ 3 files changed, 39 insertions(+) diff --git a/common_param.h b/common_param.h index 8b39ca2fc..c082d248e 100644 --- a/common_param.h +++ b/common_param.h @@ -53,6 +53,7 @@ typedef struct { int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; int sbgemm_align_k; + int need_amxtile_permission; // 0 default, 1 for device support amx. void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/interface/gemm.c b/interface/gemm.c index 71cc77a1b..285b99eb9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif +#if defined(__linux__) && defined(BFLOAT16) +#define XFEATURE_XTILEDATA 18 +#define ARCH_REQ_XCOMP_PERM 0x1023 +static int openblas_amxtile_permission = 0; +static int init_amxtile_permission() { + long status = + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (status != 0) { + fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, " + "Intel Sapphier Rapids), skip sbgemm calculation\n"); + return -1; + } + openblas_amxtile_permission = 1; + return 0; +} +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif +#if defined(__linux__) && defined(BFLOAT16) +#if defined(DYNAMIC_ARCH) + if (gotoblas->need_amxtile_permission && + openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) + if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#endif // defined(__linux__) && defined(BFLOAT16) + if ((args.m == 0) || (args.n == 0)) return; #if 0 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 7832c0a87..4c361f155 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -66,6 +66,7 @@ gotoblas_t TABLE_NAME = { #endif SBGEMM_ALIGN_K, + 0, // need_amxtile_permission sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, @@ -1809,6 +1810,12 @@ static void init_parameter(void) { #endif +#ifdef SAPPHIRERAPIDS +#if (BUILD_BFLOAT16 == 1) + TABLE_NAME.need_amxtile_permission = 1; +#endif +#endif + #if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; From 18d7afe69daa196902cd68b63cc381aaafc9d26e Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Sat, 20 May 2023 01:19:44 +0800 Subject: [PATCH 089/718] Add rvv support for zsymv and active rvv support for zhemv --- kernel/riscv64/KERNEL.x280 | 17 ++- kernel/riscv64/zhemv_LM_rvv.c | 198 +++++++++++++++++++++++++++++++++ kernel/riscv64/zhemv_UV_rvv.c | 199 ++++++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_L_rvv.c | 179 ++++++++++++++++++++++++++++++ kernel/riscv64/zsymv_U_rvv.c | 177 ++++++++++++++++++++++++++++++ 5 files changed, 766 insertions(+), 4 deletions(-) create mode 100644 kernel/riscv64/zhemv_LM_rvv.c create mode 100644 kernel/riscv64/zhemv_UV_rvv.c create mode 100644 kernel/riscv64/zsymv_L_rvv.c create mode 100644 kernel/riscv64/zsymv_U_rvv.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 217d8534e..86708fe01 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -225,10 +225,19 @@ SSYMV_U_KERNEL = symv_U_rvv.c SSYMV_L_KERNEL = symv_L_rvv.c DSYMV_U_KERNEL = symv_U_rvv.c DSYMV_L_KERNEL = symv_L_rvv.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 000000000..e025120e5 --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + jx = 0; + jy = 0; + ja = 0; + for(j = 0; j < offset; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + ix = jx + inc_x2; + iy = jy + inc_y2; + ia = ja + 2; + i = j + 1; + len = m - i; + if(len > 0){ + gvl = VSETVL(len); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ + gvl = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 000000000..0e1ea5436 --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = VSETVL(j); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ + gvl = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 += VFMVFS_FLOAT(v_res); + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 000000000..3bf621094 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < m){ + gvl = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 000000000..de1564f75 --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + + if(i < j){ + gvl = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] += VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] += VFMVFS_FLOAT_M1(v_res); + } + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + From a76afdc0474eb35bd8986a78683b3131767e0a30 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Fri, 19 May 2023 16:22:01 +0800 Subject: [PATCH 090/718] Compatible with older version of GNU make --- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index d101503bb..3a832e917 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,14 +1,14 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE -undefine SBGEMM_SMALL_M_PERMIT -undefine SBGEMM_SMALL_K_NN -undefine SBGEMM_SMALL_K_B0_NN -undefine SBGEMM_SMALL_K_NT -undefine SBGEMM_SMALL_K_B0_NT -undefine SBGEMM_SMALL_K_TN -undefine SBGEMM_SMALL_K_B0_TN -undefine SBGEMM_SMALL_K_TT -undefine SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_M_PERMIT = +SBGEMM_SMALL_K_NN = +SBGEMM_SMALL_K_B0_NN = +SBGEMM_SMALL_K_NT = +SBGEMM_SMALL_K_B0_NT = +SBGEMM_SMALL_K_TN = +SBGEMM_SMALL_K_B0_TN = +SBGEMM_SMALL_K_TT = +SBGEMM_SMALL_K_B0_TT = SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c From d96d386d138a78e806c133b9b61d4b1e37163d1b Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Sat, 20 May 2023 14:13:49 +0800 Subject: [PATCH 091/718] Add CMake dynamic build for Sapphire Rapids --- cmake/arch.cmake | 2 +- cmake/utils.cmake | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 8521f3988..f70019800 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -82,7 +82,7 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 56c1cb060..5e8ba866b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN) #message(STATUS "skipping ${makefile_line}") continue () endif () + + # Example 1: SBGEMM_SMALL_M_PERMIT = + # Unset the variable + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + unset(${var_name}) + endif() + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") #message(STATUS "match on ${line_match}") From 9e80a194d6da44bae67091edbf2d3989d1542d8a Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Sun, 21 May 2023 19:52:58 +0800 Subject: [PATCH 092/718] Fix dynamic_list build and gcc version check error --- driver/others/dynamic.c | 13 +++++++++++++ kernel/Makefile | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index dee5538fa..8e0f53f74 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_COOPERLAKE gotoblas_PRESCOTT #endif +#ifdef DYN_SAPPHIRERAPIDS +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM +#else +#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST diff --git a/kernel/Makefile b/kernel/Makefile index 977886044..d426a1bdb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,7 +33,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq ($(GCCVERSIONGTEQ11), 1) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f From 71e4125795cf4a873f486fbb6e8a7711d717e803 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Mon, 22 May 2023 21:59:59 +0800 Subject: [PATCH 093/718] Fix syscall error on non-x86 platform --- interface/gemm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 285b99eb9..4778b641b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -154,7 +154,7 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif -#if defined(__linux__) && defined(BFLOAT16) +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) #define XFEATURE_XTILEDATA 18 #define ARCH_REQ_XCOMP_PERM 0x1023 static int openblas_amxtile_permission = 0; @@ -472,7 +472,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif -#if defined(__linux__) && defined(BFLOAT16) +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) #if defined(DYNAMIC_ARCH) if (gotoblas->need_amxtile_permission && openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { @@ -484,7 +484,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif -#endif // defined(__linux__) && defined(BFLOAT16) +#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) if ((args.m == 0) || (args.n == 0)) return; From d3bf5a5401e623e107a23fb70151c7102cbd14c7 Mon Sep 17 00:00:00 2001 From: sh-zheng <2294474733@qq.com> Date: Mon, 22 May 2023 22:39:45 +0800 Subject: [PATCH 094/718] Combine two reduction operations of zhe/symv into one, with tail undisturbed setted. --- kernel/riscv64/zhemv_LM_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zhemv_UV_rvv.c | 68 +++++++++++++++++------------------ kernel/riscv64/zsymv_L_rvv.c | 50 +++++++++++++------------- kernel/riscv64/zsymv_U_rvv.c | 52 +++++++++++++-------------- 4 files changed, 119 insertions(+), 119 deletions(-) diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c index e025120e5..95c6a377c 100644 --- a/kernel/riscv64/zhemv_LM_rvv.c +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < m){ - gvl = VSETVL(m-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 += VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c index 0e1ea5436..ec06622fc 100644 --- a/kernel/riscv64/zhemv_UV_rvv.c +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m1() @@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,49 +146,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 = VFMVFS_FLOAT(v_res); + if(i < j){ - gvl = VSETVL(j-i); - va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); - va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); - vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); #ifndef HEMVREV - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #else - vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); - vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); - vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); - vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); #endif - VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); - VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); #ifndef HEMVREV - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #else - vr0 = VFMULVV_FLOAT(vx0, va0, gvl); - vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); - vr1 = VFMULVV_FLOAT(vx1, va0, gvl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); #endif - - v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); - temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); - temp_i2 += VFMVFS_FLOAT(v_res); } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); } y[jy] += temp_r1 * a_ptr[ja]; y[jy+1] += temp_i1 * a_ptr[ja]; diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c index 3bf621094..cefdea7f6 100644 --- a/kernel/riscv64/zsymv_L_rvv.c +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -57,6 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -133,38 +137,34 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < m){ - gvl = VSETVL(m-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c index de1564f75..67b5a649c 100644 --- a/kernel/riscv64/zsymv_U_rvv.c +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -56,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -129,39 +133,35 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] = VFMVFS_FLOAT_M1(v_res); if(i < j){ - gvl = VSETVL(j-i); - vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); - vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); - va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); - va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); - vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); - vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); - vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); - VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); - VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); - vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); - vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); - vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl); - vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); - vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl); - vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); - v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); - temp2[0] += VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); - temp2[1] += VFMVFS_FLOAT_M1(v_res); - } + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); } y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; From c0da3af13ba13f42628136503e1047d206c7257f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 11:37:32 +0200 Subject: [PATCH 095/718] Fix segfault when executing with row major --- lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c index 962624d21..3f8f0cf17 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_cgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); From b0a72586a32a06b725c99c1ff84ff0f9d7b54eec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 11:39:20 +0200 Subject: [PATCH 096/718] fix segfault when executing with row major --- lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c index 8a4c7cead..000c94e0b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c @@ -80,7 +80,7 @@ lapack_int LAPACKE_dgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); From abbc387145670cba345edd02ecfee7310e7422de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 11:40:25 +0200 Subject: [PATCH 097/718] fix segfault when executing with row major --- lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c index 00ca6e501..d357845ae 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_sgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); From 0d72705adc8bc917bc4ecc1b299ba7cad750e999 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 11:41:28 +0200 Subject: [PATCH 098/718] fix segfault when executing with row major (Reference-LAPACK 768) --- lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c index 8f6f1ed0b..633111533 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c @@ -83,7 +83,7 @@ lapack_int LAPACKE_zgeqrt_work( int matrix_layout, lapack_int m, lapack_int n, } /* Transpose output matrices */ LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); - LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t, + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t, ldt ); /* Release memory and exit */ LAPACKE_free( t_t ); From e2f4adfc608a47fab120dd164c2712e2c18b759f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 16:51:14 +0200 Subject: [PATCH 099/718] fix scaling (Reference-LAPACK PR 830) --- lapack-netlib/SRC/dtgsna.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dtgsna.f b/lapack-netlib/SRC/dtgsna.f index b9b3ad8af..013dc91bd 100644 --- a/lapack-netlib/SRC/dtgsna.f +++ b/lapack-netlib/SRC/dtgsna.f @@ -632,8 +632,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0D0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * From 912bfd9bfbabef50cd0399c142c52b7cc350291c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 16:52:56 +0200 Subject: [PATCH 100/718] fix scaling (Reference-LAPACK PR 830) --- lapack-netlib/SRC/stgsna.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f index 40f822915..430f3c4b7 100644 --- a/lapack-netlib/SRC/stgsna.f +++ b/lapack-netlib/SRC/stgsna.f @@ -632,8 +632,8 @@ C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA ) C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI ROOT1 = C1 + SQRT( C1*C1-4.0*C2 ) - ROOT2 = C2 / ROOT1 ROOT1 = ROOT1 / TWO + ROOT2 = C2 / ROOT1 COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) ) END IF * From e728744bd66bb9c10f24cc1daa0c65a535bab533 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 May 2023 22:30:45 +0200 Subject: [PATCH 101/718] Add comments on handling of zero-sized B matrix in ?GELSY (Reference-LAPACK PR 833) (#4056) * Add comment related to zero-sized matrix B --- lapack-netlib/SRC/cgelsy.f | 2 ++ lapack-netlib/SRC/dgelsy.f | 2 ++ lapack-netlib/SRC/sgelsy.f | 2 ++ lapack-netlib/SRC/zgelsy.f | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/SRC/cgelsy.f b/lapack-netlib/SRC/cgelsy.f index 67140f191..64bb10023 100644 --- a/lapack-netlib/SRC/cgelsy.f +++ b/lapack-netlib/SRC/cgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/dgelsy.f b/lapack-netlib/SRC/dgelsy.f index aebab9264..e9fcd9682 100644 --- a/lapack-netlib/SRC/dgelsy.f +++ b/lapack-netlib/SRC/dgelsy.f @@ -115,6 +115,7 @@ *> B is DOUBLE PRECISION array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f index 9c60f78a7..89dd39e80 100644 --- a/lapack-netlib/SRC/sgelsy.f +++ b/lapack-netlib/SRC/sgelsy.f @@ -115,6 +115,7 @@ *> B is REAL array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -147,6 +148,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK diff --git a/lapack-netlib/SRC/zgelsy.f b/lapack-netlib/SRC/zgelsy.f index 65fa87ae9..497becf8b 100644 --- a/lapack-netlib/SRC/zgelsy.f +++ b/lapack-netlib/SRC/zgelsy.f @@ -116,6 +116,7 @@ *> B is COMPLEX*16 array, dimension (LDB,NRHS) *> On entry, the M-by-NRHS right hand side matrix B. *> On exit, the N-by-NRHS solution matrix X. +*> If M = 0 or N = 0, B is not referenced. *> \endverbatim *> *> \param[in] LDB @@ -148,6 +149,7 @@ *> The effective rank of A, i.e., the order of the submatrix *> R11. This is the same as the order of the submatrix T11 *> in the complete orthogonal factorization of A. +*> If NRHS = 0, RANK = 0 on output. *> \endverbatim *> *> \param[out] WORK From 490ce5199b7e13fea34e203eddceff4cd350d213 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 May 2023 10:22:24 +0200 Subject: [PATCH 102/718] Update brief description of function purpose (Reference-LAPACK 831) --- lapack-netlib/SRC/DEPRECATED/cgegs.f | 2 +- lapack-netlib/SRC/DEPRECATED/cgegv.f | 2 +- lapack-netlib/SRC/DEPRECATED/dgegs.f | 2 +- lapack-netlib/SRC/DEPRECATED/dgegv.f | 2 +- lapack-netlib/SRC/DEPRECATED/sgegs.f | 2 +- lapack-netlib/SRC/DEPRECATED/sgegv.f | 2 +- lapack-netlib/SRC/DEPRECATED/zgegs.f | 2 +- lapack-netlib/SRC/DEPRECATED/zgegv.f | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.f b/lapack-netlib/SRC/DEPRECATED/cgegs.f index 1f0791a20..b6adf9111 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/cgegs.f @@ -1,4 +1,4 @@ -*> \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.f b/lapack-netlib/SRC/DEPRECATED/cgegv.f index ba810ddef..d2b254255 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/cgegv.f @@ -1,4 +1,4 @@ -*> \brief CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief CGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.f b/lapack-netlib/SRC/DEPRECATED/dgegs.f index 0ac0112c2..02e9fdcb2 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/dgegs.f @@ -1,4 +1,4 @@ -*> \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.f b/lapack-netlib/SRC/DEPRECATED/dgegv.f index 7e81c85a9..0b5c48922 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/dgegv.f @@ -1,4 +1,4 @@ -*> \brief DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief DGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.f b/lapack-netlib/SRC/DEPRECATED/sgegs.f index 2ed9ad942..11ecc67ac 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/sgegs.f @@ -1,4 +1,4 @@ -*> \brief SGEGS computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.f b/lapack-netlib/SRC/DEPRECATED/sgegv.f index 7a179a499..97556e371 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/sgegv.f @@ -1,4 +1,4 @@ -*> \brief SGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief SGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B). * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.f b/lapack-netlib/SRC/DEPRECATED/zgegs.f index c5cdd26e5..23f8d43d1 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegs.f +++ b/lapack-netlib/SRC/DEPRECATED/zgegs.f @@ -1,4 +1,4 @@ -*> \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B) * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.f b/lapack-netlib/SRC/DEPRECATED/zgegv.f index aa4ab3f71..542d3f4ff 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegv.f +++ b/lapack-netlib/SRC/DEPRECATED/zgegv.f @@ -1,4 +1,4 @@ -*> \brief ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices +*> \brief ZGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B). * * =========== DOCUMENTATION =========== * From 447368783aca067d6bbbc057f4528023eb3567bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 May 2023 13:06:15 +0200 Subject: [PATCH 103/718] Add quick return for N < 1 (Reference-LAPACK PR 837) --- lapack-netlib/SRC/dlaruv.f | 5 +++++ lapack-netlib/SRC/slaruv.f | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/lapack-netlib/SRC/dlaruv.f b/lapack-netlib/SRC/dlaruv.f index 0f5c9541d..1a4ce4009 100644 --- a/lapack-netlib/SRC/dlaruv.f +++ b/lapack-netlib/SRC/dlaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) diff --git a/lapack-netlib/SRC/slaruv.f b/lapack-netlib/SRC/slaruv.f index c25dc2e2f..cd37a1c47 100644 --- a/lapack-netlib/SRC/slaruv.f +++ b/lapack-netlib/SRC/slaruv.f @@ -382,6 +382,11 @@ $ 1537 / * .. * .. Executable Statements .. +* +* Quick return for N < 1 + IF ( N < 1 ) THEN + RETURN + END IF * I1 = ISEED( 1 ) I2 = ISEED( 2 ) From e09055330e88cecda4d45e380637a60e230f3c68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 May 2023 14:14:17 +0200 Subject: [PATCH 104/718] run apt-get update before fetching cross-compilers --- .github/workflows/dynamic_arch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index b537c8494..c16f87954 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -310,6 +310,7 @@ jobs: - name: Install Dependencies run: | + sudo apt-get update sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross - name: Compilation cache From 899c3a6f6a801fef080962221755eac8d543d5df Mon Sep 17 00:00:00 2001 From: Angelika Schwarz <17718454+angsch@users.noreply.github.com> Date: Thu, 27 Apr 2023 17:59:15 +0200 Subject: [PATCH 105/718] Improve input argument checks of gemmt * Fix return value for invalid info * Add missing checks for ldA, ldB * Use reference-LAPACK like checks (ie ld=0,nrows=0 is invalid) --- interface/gemmt.c | 73 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index cebc7918d..046432670 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -77,6 +77,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -155,22 +156,31 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, if (Uplo == 'L') uplo = 1; + nrowa = m; + if (transa) nrowa = k; + nrowb = k; + if (transb) nrowb = m; + info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowa)) + info = 10; + if (lda < MAX(1, nrowb)) + info = 8; if (k < 0) info = 5; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } @@ -205,11 +215,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -249,15 +262,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa, nrowb; + nrowa = m; + if (transa) nrowa = k; + nrowb = k; + if (transb) nrowb = m; + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } @@ -269,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -302,27 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola, ncolb; + ncola = k; + if (transa) ncola = m; + ncolb = m; + if (transb) ncolb = k; + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 10; + if (lda < MAX(1, ncola)) + info = 8; if (k < 0) info = 5; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; From a757e1486f5899beb6488f9c7080fd4554b78632 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 May 2023 08:44:20 +0200 Subject: [PATCH 106/718] Fix potential segfault from not allocating work array (Reference-LAPACK 836) --- lapack-netlib/LAPACKE/src/lapacke_dgeesx.c | 15 ++++++++------- lapack-netlib/LAPACKE/src/lapacke_sgeesx.c | 15 ++++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c index 7796edffc..0460b6406 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c @@ -80,10 +80,13 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; - } + } + else { + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c index f7be44297..d2555ecc8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c @@ -80,10 +80,13 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Allocate memory for work arrays */ if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; - } + } + else { + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 ); + } + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; } work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); if( work == NULL ) { @@ -97,9 +100,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort, /* Release memory and exit */ LAPACKE_free( work ); exit_level_2: - if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_1: if( LAPACKE_lsame( sort, 's' ) ) { LAPACKE_free( bwork ); From 92720c2f1ef8b52d57bd01e07044e61e94ce2094 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 May 2023 11:33:32 +0200 Subject: [PATCH 107/718] Remove unused variable SMIN (Reference-LAPACK 832) --- lapack-netlib/SRC/cbdsqr.f | 16 +++++++--------- lapack-netlib/SRC/dbdsqr.f | 16 +++++++--------- lapack-netlib/SRC/sbdsqr.f | 16 +++++++--------- lapack-netlib/SRC/zbdsqr.f | 16 +++++++--------- 4 files changed, 28 insertions(+), 36 deletions(-) diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f index 1d7c4e09d..40706644e 100644 --- a/lapack-netlib/SRC/cbdsqr.f +++ b/lapack-netlib/SRC/cbdsqr.f @@ -259,7 +259,7 @@ $ NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +372,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -426,7 +426,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +434,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +515,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +543,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +560,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f index c220a5875..bc697a007 100644 --- a/lapack-netlib/SRC/dbdsqr.f +++ b/lapack-netlib/SRC/dbdsqr.f @@ -278,7 +278,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -391,7 +391,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -451,7 +451,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -460,7 +459,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -542,14 +540,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -570,14 +568,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -587,7 +585,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/sbdsqr.f b/lapack-netlib/SRC/sbdsqr.f index c798baaf5..880f0607b 100644 --- a/lapack-netlib/SRC/sbdsqr.f +++ b/lapack-netlib/SRC/sbdsqr.f @@ -277,7 +277,7 @@ $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -390,7 +390,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -450,7 +450,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -459,7 +458,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -541,14 +539,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -569,14 +567,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -586,7 +584,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f index 2ad6142a7..faedafc3c 100644 --- a/lapack-netlib/SRC/zbdsqr.f +++ b/lapack-netlib/SRC/zbdsqr.f @@ -259,7 +259,7 @@ $ NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, - $ SINR, SLL, SMAX, SMIN, SMINL, SMINOA, + $ SINR, SLL, SMAX, SMIN, SMINOA, $ SN, THRESH, TOL, TOLMUL, UNFL * .. * .. External Functions .. @@ -372,7 +372,7 @@ DO 30 I = 1, N - 1 SMAX = MAX( SMAX, ABS( E( I ) ) ) 30 CONTINUE - SMINL = ZERO + SMIN = ZERO IF( TOL.GE.ZERO ) THEN * * Relative accuracy desired @@ -426,7 +426,6 @@ IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH ) $ D( M ) = ZERO SMAX = ABS( D( M ) ) - SMIN = SMAX DO 70 LLL = 1, M - 1 LL = M - LLL ABSS = ABS( D( LL ) ) @@ -435,7 +434,6 @@ $ D( LL ) = ZERO IF( ABSE.LE.THRESH ) $ GO TO 80 - SMIN = MIN( SMIN, ABSS ) SMAX = MAX( SMAX, ABSS, ABSE ) 70 CONTINUE LL = 0 @@ -517,14 +515,14 @@ * apply convergence criterion forward * MU = ABS( D( LL ) ) - SMINL = MU + SMIN = MU DO 100 LLL = LL, M - 1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 100 CONTINUE END IF * @@ -545,14 +543,14 @@ * apply convergence criterion backward * MU = ABS( D( M ) ) - SMINL = MU + SMIN = MU DO 110 LLL = M - 1, LL, -1 IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN E( LLL ) = ZERO GO TO 60 END IF MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) ) - SMINL = MIN( SMINL, MU ) + SMIN = MIN( SMIN, MU ) 110 CONTINUE END IF END IF @@ -562,7 +560,7 @@ * Compute shift. First, test if shifting would ruin relative * accuracy, and if so set the shift to zero. * - IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE. + IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE. $ MAX( EPS, HNDRTH*TOL ) ) THEN * * Use a zero shift to avoid loss of relative accuracy From 11d867731ae0c0925276cbcbdaeafa2087cd8061 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 May 2023 19:35:16 +0200 Subject: [PATCH 108/718] Add Apple m1/iphoneos (cross)build in Cirrus CI (#4052) * Add Apple m1/iphoneos (cross)build --- .cirrus.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 7ca1cc7bc..9aeb787b1 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -10,6 +10,18 @@ task: - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - make TARGET=VORTEX USE_OPENMP=1 CC=clang +task: + name: AppleM1/LLVM/CMAKE + compile_script: + - brew install llvm + - export PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - mkdir build + - cd build + - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. + - make + macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4 task: @@ -38,6 +50,23 @@ task: # path: "libopenblas*" # type: application/octet-streamm +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4 +task: + name: AppleM1/LLVM armv8-ios xbuild + compile_script: + - #brew install llvm + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - find /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms -name "IPhoneOS*sdk" + - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS15.5.sdk -arch arm64 -miphoneos-version-min=10.0" + - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 + always: + config_artifacts: + path: "*conf*" + type: text/plain task: name: NeoverseN1 From b1f6d90a114bda8c3d6813737ce9faae3699c43f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Jun 2023 23:21:39 +0200 Subject: [PATCH 109/718] Drop extraneous escaping in the basename call --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 8b633ab8b..aeb9779e4 100755 --- a/c_check +++ b/c_check @@ -35,7 +35,7 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" fi -bn=`basename \"$compiler_name\"` +bn=`basename "$compiler_name"` case "$bn" in *-*) if [ "$bn" != '-' ]; then From 0954746380a54c4f5c45f892dabef6b7c9aa93c3 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Sun, 4 Jun 2023 20:06:58 -0700 Subject: [PATCH 110/718] remove argument unused during compilation. fix wrong vr = VFMVVF_FLOAT(0, vl); --- Makefile.riscv64 | 2 +- kernel/riscv64/symv_L_rvv.c | 4 ---- kernel/riscv64/symv_U_rvv.c | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index d091984a6..ce7a27141 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_GENERIC) diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index e87ab22ae..b27db2e37 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -94,7 +94,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLEV_FLOAT(&y[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -157,7 +155,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = (m-i); k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); inc_xv = inc_x * vl; va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3fbc33c89..7e45b1a01 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -95,7 +95,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA for (k = j; k > 0; k -= vl, i += vl) { vl = VSETVL(k); - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); @@ -158,7 +156,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA { vl = VSETVL(k); inc_xv = inc_x * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLEV_FLOAT(&y[i], vl); va = VLEV_FLOAT(&a_ptr[i], vl); @@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vl = VSETVL(k); inc_xv = inc_x * vl; inc_yv = inc_y * vl; - vr = VFMVVF_FLOAT(0, vl); vy = VLSEV_FLOAT(&y[iy], stride_y, vl); va = VLEV_FLOAT(&a_ptr[i], vl); vy = VFMACCVF_FLOAT(vy, temp1, va, vl); From f981e37ddafe1f756d0fe176152d39642f4877e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Jun 2023 20:34:31 +0200 Subject: [PATCH 111/718] Treat f95 as gfortran --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index e615e148e..5e32ef129 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -38,7 +38,7 @@ if (${F_COMPILER} STREQUAL "G95") endif () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN") +if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") From 67d1e72e8b8343d8dd83b011234582c0e6009bbb Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 8 Jun 2023 20:20:47 +0800 Subject: [PATCH 112/718] LoongArch64: Add ABI detection for loongarch64 If lp64d ABI is supported, it is used; otherwise, it falls back to the lp64 ABI. --- Makefile.system | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 343b94bb3..7dc17c486 100644 --- a/Makefile.system +++ b/Makefile.system @@ -932,8 +932,12 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), loongarch64) -CCOMMON_OPT += -march=loongarch64 -mabi=lp64 -FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) +ifneq ($(LA64_ABI), lp64d) +LA64_ABI=lp64 +endif +CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) +FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) endif endif From 2993a732625fb81b79cab46a39585fdcc4894445 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Jun 2023 18:56:13 +0200 Subject: [PATCH 113/718] Add NVHPC(nvc) --- cmake/cc.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 83b8d15ab..aeaa76710 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") endif () endif () +if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") + if (POWER) + set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") + endif () +endif () + if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -m64") From f8c8b22740d8e9817e33d73fe4aef766eb1fb8eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Jun 2023 18:58:52 +0200 Subject: [PATCH 114/718] Add NVHPC(pgf95) --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 5e32ef129..a67760885 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -121,7 +121,7 @@ if (${F_COMPILER} STREQUAL "IBM") endif () endif () -if (${F_COMPILER} STREQUAL "PGI") +if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI") set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER") if (BINARY64) From d2144b2981a77ec52fc78d9b8d0c6a8f5c08db57 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Jun 2023 19:01:15 +0200 Subject: [PATCH 115/718] Add NVHPC --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 631e7fe69..3dc6c863e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -55,7 +55,7 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") +if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")) set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () From ee80dd8371f749b2bd38c13f5eca5c7b2963e624 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jun 2023 00:40:47 +0200 Subject: [PATCH 116/718] Ignore LAPACK's la_constants.mod --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0fe20ecaa..1195bc9b8 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ lapack-3.4.2 lapack-3.4.2.tgz lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h +lapack-netlib/SRC/la_constants.mod lapack-netlib/TESTING/testing_results.txt lapack-netlib/INSTALL/test* lapack-netlib/TESTING/xeigtstc From 58b88aa5f0ffddedec963d067acd102ef413c8e2 Mon Sep 17 00:00:00 2001 From: Manjul Mohan Date: Mon, 12 Jun 2023 01:08:59 -0400 Subject: [PATCH 117/718] POWER10: Fix compiler warnings This patch removes the warning messages related to unused variables in sbgemm_kernel_power10.c. Signed-off-by: Manjul Mohan --- kernel/power/sbgemm_kernel_power10.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 134929ec1..c3fa67cf6 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -336,7 +336,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); @@ -492,7 +491,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); - vec_t *rowB = (vec_t *) & (BO[l]); vector short rowB_mrg = { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); @@ -570,7 +568,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (m & 1) { IFLOAT *BO = B; - v2sf_t *rowC; v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; From ff618ac475bbbc301c64fa85e2f426af2320dc91 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Jun 2023 19:16:06 +0200 Subject: [PATCH 118/718] Rework removal of compiler options --- c_check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_check b/c_check index aeb9779e4..2e0e377bd 100755 --- a/c_check +++ b/c_check @@ -35,12 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" fi -bn=`basename "$compiler_name"` +cn= `echo $compiler_name | sed -e 's/ -.*//'` +bn=`basename "$cn"` case "$bn" in *-*) if [ "$bn" != '-' ]; then cross_suffix="$cross_suffix${bn%-*}-" - cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'` fi esac From 369cc0fa025e5ee14f60a61d4b574add7f6dc28e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jun 2023 20:58:23 +0200 Subject: [PATCH 119/718] CirrusCI: Add basic FreeBSD build (#4082) * Add basic FreeBSD build --- .cirrus.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 9aeb787b1..36bfaaa1f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -82,6 +82,15 @@ task: cpu: 8 compile_script: - make USE_OPENMP=1 + +FreeBSD_task: + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + compile_script: + - ls -l /usr/local/lib + - gmake CC=gcc #task: # name: Windows/LLVM16 --- too slow --- From c94cfef30a236f3fee14882fc453104a50a0a972 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Jun 2023 11:18:21 +0200 Subject: [PATCH 120/718] CirrusCI: Add INTERFACE64 jobs on MacOS, FreeBSD, and on Linux arm64 (#4085) * Add INTERFACE64 jobs on MacOS, FreeBSD, and on Linux arm64 --- .cirrus.yml | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 36bfaaa1f..bef2b1e86 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -10,6 +10,15 @@ task: - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - make TARGET=VORTEX USE_OPENMP=1 CC=clang +task: + name: AppleM1/LLVM/ILP64 + compile_script: + - brew install llvm + - export PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 + task: name: AppleM1/LLVM/CMAKE compile_script: @@ -74,7 +83,14 @@ task: image: node:latest compile_script: - make - + + task: + name: NeoverseN1-ILP64 + arm_container: + image: node:latest + compile_script: + - make INTERFACE64=1 + task: name: NeoverseN1-OMP arm_container: @@ -84,6 +100,7 @@ task: - make USE_OPENMP=1 FreeBSD_task: + name: FreeBSD-gcc12 freebsd_instance: image_family: freebsd-13-2 install_script: @@ -91,7 +108,18 @@ FreeBSD_task: compile_script: - ls -l /usr/local/lib - gmake CC=gcc - + + +FreeBSD_task: + name: freebsd-gcc12-ilp64 + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + compile_script: + - ls -l /usr/local/lib + - gmake CC=gcc INTERFACE64=1 + #task: # name: Windows/LLVM16 --- too slow --- # windows_container: From c677978789eefd93f87a9e07f8f14743ce3179d9 Mon Sep 17 00:00:00 2001 From: sL1pKn07 Date: Sat, 17 Jun 2023 15:17:45 +0200 Subject: [PATCH 121/718] Install .cmake files in CMAKE_INSTALL_LIBDIR This avoid problems when build and install 64 and 32 bits libs at same time, because both overwrite the .cmake files if install it in an non-architecture depend path like is `share/cmake/foo` is. cmake now pick the correct .cmake files when try to add to another project as depencies --- CMakeLists.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc964b76e..86758d8b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,7 +321,7 @@ if (NOT NOFORTRAN) if (NOT ONLY_CBLAS) # Build test and ctest add_subdirectory(test) - endif() + endif() if (BUILD_TESTING) add_subdirectory(lapack-netlib/TESTING) endif() @@ -438,7 +438,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) @@ -549,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/ install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". set(PN OpenBLAS) -set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}") +set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}") configure_package_config_file(cmake/${PN}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake" INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR}) From c7cf1ae95adc2b0a82dfaf7f5c6be70272579e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 08:46:27 +0200 Subject: [PATCH 122/718] Remove LL/sceil (same as CEILING intrinsic) (Reference-LAPACK PR847) --- lapack-netlib/SRC/VARIANTS/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 25d8ee175..35e50cbc2 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -28,7 +28,7 @@ LULL = lu/LL/cgetrf.o lu/LL/dgetrf.o lu/LL/sgetrf.o lu/LL/zgetrf.o LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o -QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o +QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o .PHONY: all From 5c742ad44cad8d613127f470d1cfa56c40567ba6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 09:59:44 +0200 Subject: [PATCH 123/718] Fix description (Reference-LAPACK PR847) --- lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f index 16d250c3f..f9384b416 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f @@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real Hermitian +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form From ce7f4adb4270fe8ccaf28000e2371e9d456b5bdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:00:49 +0200 Subject: [PATCH 124/718] Fix description (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f index 76d8bab61..bef27afa7 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: right looking block version of the algorithm, calli C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real Hermitian +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form From 8b32cebfcc4d3fb5ac914564991915c5effb4199 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:02:12 +0200 Subject: [PATCH 125/718] Fix description (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f index d6149c08f..62d0a2580 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> CPOTRF computes the Cholesky factorization of a real symmetric +C> CPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form From 35f2ce8340a3186c56debb09b3fa99d85d9af0b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:03:43 +0200 Subject: [PATCH 126/718] Fix description (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f index 62d0a2580..c810a1533 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the From 2dae4a34a942b44ada32a3c4e6d7db524cfe9001 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:05:03 +0200 Subject: [PATCH 127/718] Fix description (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f index f8b9e253c..449c7ac95 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f @@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: top-looking block version of the algorithm, calling C>\details \b Purpose: C>\verbatim C> -C> ZPOTRF computes the Cholesky factorization of a real symmetric +C> ZPOTRF computes the Cholesky factorization of a complex Hermitian C> positive definite matrix A. C> C> The factorization has the form @@ -55,7 +55,7 @@ C> C> \param[in,out] A C> \verbatim C> A is COMPLEX*16 array, dimension (LDA,N) -C> On entry, the symmetric matrix A. If UPLO = 'U', the leading +C> On entry, the Hermitian matrix A. If UPLO = 'U', the leading C> N-by-N upper triangular part of A contains the upper C> triangular part of the matrix A, and the strictly lower C> triangular part of A is not referenced. If UPLO = 'L', the From f330f862f9057ebf7698cb84a1007a65e6c9a038 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:08:36 +0200 Subject: [PATCH 128/718] Delete and replace with CEILING intrinsic (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f | 86 ------------------------ 1 file changed, 86 deletions(-) delete mode 100644 lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f deleted file mode 100644 index a007360ba..000000000 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f +++ /dev/null @@ -1,86 +0,0 @@ -C> \brief \b SCEIL -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* REAL FUNCTION SCEIL( A ) -* -* .. Scalar Arguments .. -* REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. -* INTRINSIC INT -* .. -* .. Executable Statements ..* -* -* IF (A-INT(A).EQ.0) THEN -* SCEIL = A -* ELSE IF (A.GT.0) THEN -* SCEIL = INT(A)+1; -* ELSE -* SCEIL = INT(A) -* END IF -* -* RETURN -* -* END -* Purpose -* ======= -* -C>\details \b Purpose: -C>\verbatim -C>\endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -C> \author Univ. of Tennessee -C> \author Univ. of California Berkeley -C> \author Univ. of Colorado Denver -C> \author NAG Ltd. -* -C> \date December 2016 -* -C> \ingroup variantsOTHERcomputational -* -* ===================================================================== - REAL FUNCTION SCEIL( A ) -* -* -- LAPACK computational routine -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* -* .. Scalar Arguments ..* - REAL A -* .. -* -* ===================================================================== -* -* .. Intrinsic Functions .. - INTRINSIC INT -* .. -* .. Executable Statements ..* -* - IF (A-INT(A).EQ.0) THEN - SCEIL = A - ELSE IF (A.GT.0) THEN - SCEIL = INT(A)+1; - ELSE - SCEIL = INT(A) - END IF - - RETURN -* - END From f524594d27cb1e9c6ea847f42df260eeb2a34071 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 10:11:30 +0200 Subject: [PATCH 129/718] Replace SCEIL with CEILING intrinsic (Reference-LAPACK PR 847) --- lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 13 ++++++------- lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 11 +++++------ lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 11 +++++------ lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 13 ++++++------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 46eaf33b9..743731a00 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b CGEQRF VARIANT: left-looking Level 3 BLAS version of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> CGEQRF computes a QR factorization of a real M-by-N matrix A: +C> CGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,11 @@ C> EXTERNAL CGEQR2, CLARFB, CLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index 55cab8b23..bbdd46113 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -172,12 +172,11 @@ C> EXTERNAL DGEQR2, DLARFB, DLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index d2ad13ced..bf68d635b 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -172,12 +172,11 @@ C> EXTERNAL SGEQR2, SLARFB, SLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 623b88a8a..06918568e 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -23,7 +23,7 @@ C> \brief \b ZGEQRF VARIANT: left-looking Level 3 BLAS of the algorithm. C>\details \b Purpose: C>\verbatim C> -C> ZGEQRF computes a QR factorization of a real M-by-N matrix A: +C> ZGEQRF computes a QR factorization of a complex M-by-N matrix A: C> A = Q * R. C> C> This is the left-looking Level 3 BLAS version of the algorithm. @@ -172,12 +172,11 @@ C> EXTERNAL ZGEQR2, ZLARFB, ZLARFT, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC CEILING, MAX, MIN, REAL * .. * .. External Functions .. INTEGER ILAENV - REAL SCEIL - EXTERNAL ILAENV, SCEIL + EXTERNAL ILAENV * .. * .. Executable Statements .. @@ -205,13 +204,13 @@ C> * * So here 4 x 4 is the last T stored in the workspace * - NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB + NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB * * optimal workspace = space for dlarfb + space for normal T's + space for the last T * LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) - LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) + LLWORK = CEILING(REAL(LLWORK)/REAL(NB)) IF( K.EQ.0 ) THEN @@ -230,7 +229,7 @@ C> ELSE - LBWORK = SCEIL(REAL(K)/REAL(NB))*NB + LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB WORK( 1 ) = LWKOPT From 0e55702b80bde454dfd109149ef50f35ef0ff085 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 11:17:21 +0200 Subject: [PATCH 130/718] Use dynamic allocation in ?SYL01 tests (Reference-LAPACK PR 854) --- lapack-netlib/TESTING/EIG/csyl01.f | 33 +++++++++++++++++++++++----- lapack-netlib/TESTING/EIG/dsyl01.f | 35 +++++++++++++++++++++++++----- lapack-netlib/TESTING/EIG/ssyl01.f | 35 +++++++++++++++++++++++++----- lapack-netlib/TESTING/EIG/zsyl01.f | 33 +++++++++++++++++++++++----- 4 files changed, 114 insertions(+), 22 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f index 82d790daa..8a3cd1ae5 100644 --- a/lapack-netlib/TESTING/EIG/csyl01.f +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -120,14 +120,16 @@ COMPLEX RMUL * .. * .. Local Arrays .. - COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + REAL DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + REAL, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL SISNAN REAL SLAMCH, CLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f index 782d2cd42..0ea481382 100644 --- a/lapack-netlib/TESTING/EIG/dsyl01.f +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + DOUBLE PRECISION DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 126 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL DISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 126 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f index 22d089dc8..fda30a3c0 100644 --- a/lapack-netlib/TESTING/EIG/ssyl01.f +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -117,13 +117,15 @@ $ SCALE, SCALE3, SMLNUM, TNRM, XNRM * .. * .. Local Arrays .. - REAL A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + REAL DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), - $ SWORK( LDSWORK, 54 ), VM( 2 ) - INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) + $ VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X, + $ SWORK * .. * .. External Functions .. LOGICAL SISNAN @@ -136,6 +138,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, REAL, MAX * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -280,6 +296,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f index 329f39dc4..5d26d494c 100644 --- a/lapack-netlib/TESTING/EIG/zsyl01.f +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -120,14 +120,16 @@ COMPLEX*16 RMUL * .. * .. Local Arrays .. - COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), - $ C( MAXM, MAXN ), CC( MAXM, MAXN ), - $ X( MAXM, MAXN ), - $ DUML( MAXM ), DUMR( MAXN ), + COMPLEX*16 DUML( MAXM ), DUMR( MAXN ), $ D( MAX( MAXM, MAXN ) ) - DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + DOUBLE PRECISION DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: SWORK +* .. * .. External Functions .. LOGICAL DISNAN DOUBLE PRECISION DLAMCH, ZLANGE @@ -139,6 +141,20 @@ * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, SQRT * .. +* .. Allocate memory dynamically .. + ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( SWORK( LDSWORK, 103 ), STAT = AllocateStatus ) + IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * * Get machine parameters @@ -286,6 +302,13 @@ END DO END DO END DO +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (CC, STAT = AllocateStatus) + DEALLOCATE (X, STAT = AllocateStatus) + DEALLOCATE (SWORK, STAT = AllocateStatus) * RETURN * From 73b30b1dec1d0e9e50e3486854574b5f6097cefd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jun 2023 17:46:29 +0200 Subject: [PATCH 131/718] Fix VLEV_FLOAT/VSEV_FLOAT macros to compile with t-head 2.6.1 --- kernel/riscv64/amax_vector.c | 8 ++++---- kernel/riscv64/amin_vector.c | 8 ++++---- kernel/riscv64/asum_vector.c | 8 ++++---- kernel/riscv64/axpby_vector.c | 16 ++++++++-------- kernel/riscv64/axpy_vector.c | 16 ++++++++-------- kernel/riscv64/copy_vector.c | 16 ++++++++-------- kernel/riscv64/dot_vector.c | 8 ++++---- kernel/riscv64/gemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/gemv_t_vector.c | 8 ++++---- kernel/riscv64/iamax_vector.c | 8 ++++---- kernel/riscv64/iamin_vector.c | 8 ++++---- kernel/riscv64/imax_vector.c | 8 ++++---- kernel/riscv64/imin_vector.c | 8 ++++---- kernel/riscv64/izamax_vector.c | 4 ++-- kernel/riscv64/izamin_vector.c | 4 ++-- kernel/riscv64/max_vector.c | 8 ++++---- kernel/riscv64/min_vector.c | 8 ++++---- kernel/riscv64/nrm2_vector.c | 8 ++++---- kernel/riscv64/rot_vector.c | 16 ++++++++-------- kernel/riscv64/scal_vector.c | 16 ++++++++-------- kernel/riscv64/swap_vector.c | 16 ++++++++-------- kernel/riscv64/symv_L_vector.c | 16 ++++++++-------- kernel/riscv64/symv_U_vector.c | 16 ++++++++-------- kernel/riscv64/zamax_vector.c | 4 ++-- kernel/riscv64/zamin_vector.c | 4 ++-- kernel/riscv64/zasum_vector.c | 8 ++++---- kernel/riscv64/zaxpby_vector.c | 8 ++++---- kernel/riscv64/zaxpy_vector.c | 8 ++++---- kernel/riscv64/zcopy_vector.c | 8 ++++---- kernel/riscv64/zdot_vector.c | 8 ++++---- kernel/riscv64/zgemv_n_vector.c | 16 ++++++++-------- kernel/riscv64/zgemv_t_vector.c | 4 ++-- kernel/riscv64/zhemv_LM_vector.c | 8 ++++---- kernel/riscv64/zhemv_UV_vector.c | 8 ++++---- kernel/riscv64/znrm2_vector.c | 8 ++++---- kernel/riscv64/zrot_vector.c | 16 ++++++++-------- kernel/riscv64/zscal_vector.c | 8 ++++---- kernel/riscv64/zswap_vector.c | 16 ++++++++-------- 38 files changed, 190 insertions(+), 190 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b778d3e55..1b7799340 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index fd2f83dc9..f9b7defae 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index a82275153..fc73362bc 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 988c57ec2..676dfd474 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 98b9f6814..6f921f2d6 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index a46136d6c..fee5e195d 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index 64efc6c40..f47e0c0b5 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 32ca8618b..bb9ab8e5a 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7683641fa..7d0b70cbb 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index ecb4cd7a9..9fea522f7 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index c72bb94cc..4e81e7848 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index c2d787ab8..ca48a3c48 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index dfe9a3310..2a677098d 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 @@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index fdbdc3ae8..66a101566 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 59c720310..818193a9e 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index b988513c9..7f31e9a53 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index be0803df6..14b7e01ed 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 @@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 2a83e2a52..cf6fdb741 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 9b48d1c69..43a65e552 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFMSACVF_FLOAT vfmsac_vf_f32m4 @@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFMSACVF_FLOAT vfmsac_vf_f64m4 diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 7a3153b7c..8b9ef5a3e 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #define VFMULVF_FLOAT vfmul_vf_f32m8 #define VFMVVF_FLOAT vfmv_v_f_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #define VFMULVF_FLOAT vfmul_vf_f64m8 #define VFMVVF_FLOAT vfmv_v_f_f64m8 #endif diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index d9421e2f1..b16592808 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 6588f4dda..58ec17b03 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 31104eae6..34ff0e30a 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 9dbeba90f..bfb282ae0 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index dc58075ac..d9eca7f10 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 8386ab62e..0d1cc42f1 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 #define MASK_T vbool4_t #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 3eca20415..5e6034ac5 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 303d3541e..4ccfe4a81 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 600f02bba..55a480a35 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index ec38ed9d2..0900206b3 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index b5ee1f054..3095c28f9 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 #else #define VSETVL(n) vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 #endif diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index e930dc2a2..a7a8a5279 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 275ee9131..0a284a999 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 2f46977d4..33b7c9c25 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 59d0e219d..cadabdb75 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT vfmacc_vv_f32m4 #define VFMVVF_FLOAT vfmv_v_f_f32m4 @@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT vfmacc_vv_f64m4 #define VFMVVF_FLOAT vfmv_v_f_f64m4 diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 2fdd8135a..858dfd173 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle_v_f32m4 -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSEV_FLOAT vse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLEV_FLOAT vle32_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSEV_FLOAT vse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle_v_f64m4 -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSEV_FLOAT vse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLEV_FLOAT vle64_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSEV_FLOAT vse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 64323aa3a..d275b75f8 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m4(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse_v_f32m4 -#define VSSEV_FLOAT vsse_v_f32m4 +#define VLSEV_FLOAT vlse32_v_f32m4 +#define VSSEV_FLOAT vsse32_v_f32m4 #define VFMACCVF_FLOAT vfmacc_vf_f32m4 #define VFMULVF_FLOAT vfmul_vf_f32m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 @@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e64m4(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse_v_f64m4 -#define VSSEV_FLOAT vsse_v_f64m4 +#define VLSEV_FLOAT vlse64_v_f64m4 +#define VSSEV_FLOAT vsse64_v_f64m4 #define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 7550294b5..c1dcaccab 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle_v_f32m8 -#define VLSEV_FLOAT vlse_v_f32m8 -#define VSEV_FLOAT vse_v_f32m8 -#define VSSEV_FLOAT vsse_v_f32m8 +#define VLEV_FLOAT vle32_v_f32m8 +#define VLSEV_FLOAT vlse32_v_f32m8 +#define VSEV_FLOAT vse32_v_f32m8 +#define VSSEV_FLOAT vsse32_v_f32m8 #else #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle_v_f64m8 -#define VLSEV_FLOAT vlse_v_f64m8 -#define VSEV_FLOAT vse_v_f64m8 -#define VSSEV_FLOAT vsse_v_f64m8 +#define VLEV_FLOAT vle64_v_f64m8 +#define VLSEV_FLOAT vlse64_v_f64m8 +#define VSEV_FLOAT vse64_v_f64m8 +#define VSSEV_FLOAT vsse64_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) From 655058dd2177ce07d3a250568992473fd7347683 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Jun 2023 14:58:35 +0200 Subject: [PATCH 132/718] fix spurious compiler error and more extraneous quotes --- c_check | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/c_check b/c_check index 2e0e377bd..7c8494e4a 100755 --- a/c_check +++ b/c_check @@ -31,11 +31,11 @@ flags="$*" cross_suffix="" -if [ "`dirname \"$compiler_name\"`" != '.' ]; then - cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" +if [ "`dirname "$compiler_name"`" != '.' ]; then + cross_suffix="$cross_suffix`dirname "$compiler_name"`/" fi -cn= `echo $compiler_name | sed -e 's/ -.*//'` +cn=`echo $compiler_name | sed -e 's/ -.*//'` bn=`basename "$cn"` case "$bn" in From 8d57af540b12dc864d49dc45ebd8dc449b10e41b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 10:42:46 +0200 Subject: [PATCH 133/718] Add Dynamic Mode Decomposition functions (Reference-LAPACK PR 736) --- lapack-netlib/SRC/cgedmd.f90 | 995 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/cgedmdq.f90 | 689 +++++++++++++++++++++ lapack-netlib/SRC/dgedmd.f90 | 1054 +++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgedmdq.f90 | 704 ++++++++++++++++++++++ lapack-netlib/SRC/sgedmd.f90 | 1054 +++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgedmdq.f90 | 703 ++++++++++++++++++++++ lapack-netlib/SRC/zgedmd.f90 | 996 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgedmdq.f90 | 689 +++++++++++++++++++++ 8 files changed, 6884 insertions(+) create mode 100644 lapack-netlib/SRC/cgedmd.f90 create mode 100644 lapack-netlib/SRC/cgedmdq.f90 create mode 100644 lapack-netlib/SRC/dgedmd.f90 create mode 100644 lapack-netlib/SRC/dgedmdq.f90 create mode 100644 lapack-netlib/SRC/sgedmd.f90 create mode 100644 lapack-netlib/SRC/sgedmdq.f90 create mode 100644 lapack-netlib/SRC/zgedmd.f90 create mode 100644 lapack-netlib/SRC/zgedmdq.f90 diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 new file mode 100644 index 000000000..499489270 --- /dev/null +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -0,0 +1,995 @@ + SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! CGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, CGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, CGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: CGESVD (the QR SVD algorithm) +! 2 :: CGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of W(1:k,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and W. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) N-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) M-by-N array. +! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:M,1:K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! If JOBF =='N', then B is not referenced. +! See the descriptions of X, W, K. +!..... +! LDB (input) INTEGER, LDB >= M +! The leading dimension of the array B. +!..... +! W (workspace/output) COMPLEX(KIND=WP) N-by-N array +! On exit, W(1:K,1:K) contains the K computed +! eigenvectors of the matrix Rayleigh quotient. +! The Ritz vectors (returned in Z) are the +! product of X (containing a POD basis for the input +! matrix X) and W. See the descriptions of K, S, X and Z. +! W is also used as a workspace to temporarily store the +! right singular vectors of X. +!..... +! LDW (input) INTEGER, LDW >= N +! The leading dimension of the array W. +!..... +! S (workspace/output) COMPLEX(KIND=WP) N-by-N array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by CGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +! ZWORK is used as complex workspace in the complex SVD, as +! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +! the eigenvalues of a Rayleigh quotient. +! If the call to CGEDMD is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +! LZWORK_SVD is calculated as follows +! If WHTSVD == 1 :: CGESVD :: +! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +! If WHTSVD == 2 :: CGESDD :: +! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +! If WHTSVD == 3 :: CGESVDQ :: +! LZWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: CGEJSV :: +! LZWORK_SVD = obtainable by a query +! If on entry LZWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths and returns them in +! LZWORK(1) and LZWORK(2), respectively. +!..... +! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +! On exit, RWORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to CGEDMD is only workspace query, then +! RWORK(1) contains the minimal workspace length. +! See the description of LRWORK. +!..... +! LRWORK (input) INTEGER +! The minimal length of the workspace vector RWORK. +! LRWORK is calculated as follows: +! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +! for the SVD subroutine determined by the input parameter +! WHTSVD. +! If WHTSVD == 1 :: CGESVD :: +! LRWORK_SVD = 5*MIN(M,N) +! If WHTSVD == 2 :: CGESDD :: +! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +! If WHTSVD == 3 :: CGESVDQ :: +! LRWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: CGEJSV :: +! LRWORK_SVD = obtainable by a query +! If on entry LRWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! real workspace length and returns it in RWORK(1). +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for ZWORK, RWORK and +! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 + EXTERNAL CLANGE, SLAMCH, SCNRM2, ICAMAX + INTEGER ICAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CAXPY, CGEMM, CSSCAL + EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & + CLACPY, CLASCL, CLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC FLOAT, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of CGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX(MWRSDD,INT( ZWORK(1) )) + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the CGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL CGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) ! LAPACK CALL + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + OLWORK = MAX( 2, OLWORK ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O')*SLAMCH('P') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using CLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = SCNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(ICAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL CSSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using CLASSQ. + DO i = 1, N + !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('CGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL CSSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL CLASCL( 'G', 0, 0, -RWORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(ICAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL CLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL CLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case CGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL CLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('CGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( RWORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^H * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^H is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL CSSCAL( N, ONE/RWORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that CGESVD, CGESVDQ and CGESDD return the + ! adjoint matrix of the right singular vectors. + !DO i = 1, K + ! CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + RWORK(N+i) = ONE/RWORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside CGEDMD). + CALL CGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, & + LDW, ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL CGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, & + LDZ, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^H * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL CGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! + CALL CGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, & + LDW, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^H * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^H is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL CLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL CLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL CGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, & + LDW, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. See the description of Z. + ! Also, see the description of CGEEV. + IF ( INFO1 > 0 ) THEN + ! CGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL CGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, & + LDW, ZZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z + CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL CLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, B, LDB) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + ! LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + ! CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the Ritz vectors + IF ( WNTVEC ) CALL CGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + DO i = 1, K + CALL CAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC + RES(i) = SCNRM2( M, Y(1,i), 1) ! BLAS CALL + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + RWORK(N+1) = XSCL1 + RWORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE CGEDMD + diff --git a/lapack-netlib/SRC/cgedmdq.f90 b/lapack-netlib/SRC/cgedmdq.f90 new file mode 100644 index 000000000..52c1669c7 --- /dev/null +++ b/lapack-netlib/SRC/cgedmdq.f90 @@ -0,0 +1,689 @@ +SUBROUTINE CGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, EIGS, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, LWORK, & + IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LZWORK, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*) + COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, CGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, CGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretised operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the inital QR facorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: CGESVD (the QR SVD algorithm) +! 2 :: CGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by CGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). +! See the description of ZWORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N-1) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! Z*V, where Z contains orthonormal matrix (the product of +! Q from the initial QR factorization and the SVD/POD_basis +! returned by CGEDMD in X) and the second factor (the +! eigenvectors of the Rayleigh quotient) is in the array V, +! as returned by CGEDMD. That is, X(:,1:K)*V(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of V(1:K,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) V contains the K eigenvectors of +! the Rayleigh quotient. The Ritz vectors +! (returned in Z) are the product of Q from the initial QR +! factorization (see the description of F) X (see the +! description of X) and V. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by CGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array +! On exit, +! ZWORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by CGEQRF of the +! M-by-N input matrix F. +! If the call to CGEDMDQ is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for CGEQRF[M,N]) +! MLWDMD = minimal workspace for CGEDMD (see the +! description of LWORK in CGEDMD) +! MLWMQR = N (minimal workspace for +! ZUNMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) +! MINMN = MIN(M,N) +! Then +! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) +! is further updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LZWORK = MAX( LZWORK, MINMN+MLWMQR ) +! if JOBQ == 'Q' THEN +! LZWORK = MAX( ZLWORK, MINMN+MLWGQR) +! +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to CGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is the same as in CGEDMD, because in CGEDMDQ +! only CGEDMD requires real workspace for snapshots +! of dimensions MIN(M,N)-by-(N-1). +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEQRF, CLACPY, CLASET, CUNGQR, & + CUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL CGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for CGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL CGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, -1, IWORK,& + LIWORK, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL CUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK, MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK, MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'CGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL CGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL CLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL CLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL CLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL CLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See CGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by CGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by CGEDMD. + CALL CLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL CLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to CGEDMDQ is to be + +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL CLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL CLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE CGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 new file mode 100644 index 000000000..20424808f --- /dev/null +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -0,0 +1,1054 @@ + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! DGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, DGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, DGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!............................................................ +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: DGESVD (the QR SVD algorithm) +! 2 :: DGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) REAL(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) REAL(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1). +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, and Z. +!..... +! IMEIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of IMEIG contain +! the imaginary parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! The eigenvalues are determined as follows: +! If IMEIG(i) == 0, then the corresponding eigenvalue is +! real, LAMBDA(i) = REIG(i). +! If IMEIG(i)>0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consecutive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, and Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-N array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value; ||Z(:,i)||_2=1. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! || Z(:,i:i+1)||_F = 1. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of X(:,1:K)*W(1:K,1:K), where the columns +! of W(1:k,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +! are similarly structured: If IMEIG(i) == 0 then +! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and W. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) N-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of REIG, IMEIG and Z. +!..... +! B (output) REAL(KIND=WP) M-by-N array. +! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:M,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! If JOBF =='N', then B is not referenced. +! See the descriptions of X, W, K. +!..... +! LDB (input) INTEGER, LDB >= M +! The leading dimension of the array B. +!..... +! W (workspace/output) REAL(KIND=WP) N-by-N array +! On exit, W(1:K,1:K) contains the K computed +! eigenvectors of the matrix Rayleigh quotient (real and +! imaginary parts for each complex conjugate pair of the +! eigenvalues). The Ritz vectors (returned in Z) are the +! product of X (containing a POD basis for the input +! matrix X) and W. See the descriptions of K, S, X and Z. +! W is also used as a workspace to temporarily store the +! right singular vectors of X. +!..... +! LDW (input) INTEGER, LDW >= N +! The leading dimension of the array W. +!..... +! S (workspace/output) REAL(KIND=WP) N-by-N array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by DGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, WORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +! scaling factor WORK(N+2)/WORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to DGEDMD is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! leng of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! If WHTSVD == 1 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +! If JOBZ == 'N' then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +! workspace length of DGESVD. +! If WHTSVD == 2 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +! minimal workspace length of DGESDD. +! If WHTSVD == 3 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = N+M+MAX(3*N+1, +! MAX(1,3*N+M,5*N),MAX(1,N)) +! is the minimal workspace length of DGESVDQ. +! If WHTSVD == 4 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +! minimal workspace length of DGEJSV. +! The above expressions are not simplified in order to +! make the usage of WORK more transparent, and for +! easier checking. In any case, LWORK >= 2. +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT + +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 + EXTERNAL DLANGE, DLAMCH, DNRM2, IDAMAX + INTEGER IDAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DAXPY, DGEMM, DSCAL + EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & + DLACPY, DLASCL, DLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of DGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of DGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = MAX( MWRSVQ, INT(RDUMMY(1)) ) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the DGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL DGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using DLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + X(1,i), M, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( WORK(i) > ZERO ) THEN + CALL DSCAL( M, ONE/WORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL DLASCL( 'G', 0, 0, -WORK(i), & + ONE/DBLE(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( Y(IDAMAX(M, Y(1,i),1),i ) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL DSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using DLASSQ. + DO i = 1, N + !WORK(i) = DNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('DGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + Y(1,i), M, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( WORK(i) > ZERO ) THEN + CALL DSCAL( M, ONE/WORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL DLASCL( 'G', 0, 0, -WORK(i), & + ONE/DBLE(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( X(IDAMAX(M, X(1,i),1),i ) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL DGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL DLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL DGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL DLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case DGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL DLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('DGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL DSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that DGESVD, DGESVDQ and DGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside DGEDMD). + CALL DGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, & + LDW, ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL DGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, & + LDZ, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^T * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL DGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! In the two DGEMM calls here, can use K for LDZ. + CALL DGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, & + LDW, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^T * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^T is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL DLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL DLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL DGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, & + LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. Even in the case of complex spectrum, all + ! computation is done in real arithmetic. REIG and + ! IMEIG are the real and the imaginary parts of the + ! eigenvalues, so that the spectrum is given as + ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs + ! are listed at consecutive positions. For such a + ! complex conjugate pair of the eigenvalues, the + ! corresponding eigenvectors are also a complex + ! conjugate pair with the real and imaginary parts + ! stored column-wise in W at the corresponding + ! consecutive column indices. See the description of Z. + ! Also, see the description of DGEEV. + IF ( INFO1 > 0 ) THEN + ! DGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL DGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, & + LDW, ZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL DLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = DNRM2( M, Y(1,i), 1) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = DLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE DGEDMD + diff --git a/lapack-netlib/SRC/dgedmdq.f90 b/lapack-netlib/SRC/dgedmdq.f90 new file mode 100644 index 000000000..bedfba472 --- /dev/null +++ b/lapack-netlib/SRC/dgedmdq.f90 @@ -0,0 +1,704 @@ +SUBROUTINE DGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, REIG, IMEIG, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*) + REAL(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, DGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, DGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! orthogonal matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: DGESVD (the QR SVD algorithm) +! 2 :: DGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) REAL(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by DGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in WORK(1:N). +! See the description of WORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K)of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, Z. +!..... +! IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consequtive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of Z*V, where the columns of V are the +! eigenvectors of the K-by-K Rayleigh quotient, and Z is +! orthonormal. The columns of V are similarly structured: +! If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if +! IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and +! Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of Z. +!..... +! B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The eigenvectors of a complex +! conjugate pair of eigenvalues are returned in real form +! as explained in the description of Z. The Ritz vectors +! (returned in Z) are the product of X and V; see +! the descriptions of X and Z. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) REAL(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by DGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by DGEQRF of the +! M-by-N input matrix F. +! WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to DGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for DGEQRF[M,N]) +! MLWDMD = minimal workspace for DGEDMD (see the +! description of LWORK in DGEDMD) for +! snapshots of dimensions MIN(M,N)-by-(N-1) +! MLWMQR = N (minimal workspace for +! DORMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for DORGQR[M,N,N]) +! Then +! LWORK = MAX(N+MLWQR, N+MLWDMD) +! is updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) +! if JOBQ == 'Q' THEN +! LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MLWDMD, MLWGQR, & + MLWMQR, MLWORK, MLWQR, MINMN, & + OLWDMD, OLWGQR, OLWMQR, OLWORK, & + OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! Local array +! ~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEMM + EXTERNAL DGEQRF, DLACPY, DLASET, DORGQR, & + DORMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL DGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for DGEQRF. + MLWORK = MINMN + MLWQR + IF ( LQUERY ) THEN + CALL DGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL DORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31 + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'DGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL DGEQRF( M, N, F, LDF, WORK, & + WORK(MINMN+1), LWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL DLASET( 'L', MINMN, N-1, ZERO, ZERO, X, LDX ) + CALL DLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL DLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL DLASET( 'L', MINMN-2, N-2, ZERO, ZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, V, & + LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, & + IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See DGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, & + ZERO, Z(MINMN+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by DGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by DGEDMD. + CALL DLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL DLASET( 'A', M-N, K, ZERO, ZERO, & + Z(N+1,1), LDZ ) + CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to DGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL DLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY ) + CALL DLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/orthogonal factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE DGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 new file mode 100644 index 000000000..49cb11527 --- /dev/null +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -0,0 +1,1054 @@ + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! SGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, SGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, SGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: SGESVD (the QR SVD algorithm) +! 2 :: SGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) REAL(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) REAL(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, and Z. +!..... +! IMEIG (output) REAL(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of IMEIG contain +! the imaginary parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! The eigenvalues are determined as follows: +! If IMEIG(i) == 0, then the corresponding eigenvalue is +! real, LAMBDA(i) = REIG(i). +! If IMEIG(i)>0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consecutive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, and Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-N array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value; ||Z(:,i)||_2=1. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! || Z(:,i:i+1)||_F = 1. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of X(:,1:K)*W(1:K,1:K), where the columns +! of W(1:k,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +! are similarly structured: If IMEIG(i) == 0 then +! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and W. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) N-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of REIG, IMEIG and Z. +!..... +! B (output) REAL(KIND=WP) M-by-N array. +! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:M,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! If JOBF =='N', then B is not referenced. +! See the descriptions of X, W, K. +!..... +! LDB (input) INTEGER, LDB >= M +! The leading dimension of the array B. +!..... +! W (workspace/output) REAL(KIND=WP) N-by-N array +! On exit, W(1:K,1:K) contains the K computed +! eigenvectors of the matrix Rayleigh quotient (real and +! imaginary parts for each complex conjugate pair of the +! eigenvalues). The Ritz vectors (returned in Z) are the +! product of X (containing a POD basis for the input +! matrix X) and W. See the descriptions of K, S, X and Z. +! W is also used as a workspace to temporarily store the +! left singular vectors of X. +!..... +! LDW (input) INTEGER, LDW >= N +! The leading dimension of the array W. +!..... +! S (workspace/output) REAL(KIND=WP) N-by-N array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by SGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, WORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +! scaling factor WORK(N+2)/WORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to SGEDMD is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! If WHTSVD == 1 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +! If JOBZ == 'N' then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +! workspace length of SGESVD. +! If WHTSVD == 2 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +! minimal workspace length of SGESDD. +! If WHTSVD == 3 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = N+M+MAX(3*N+1, +! MAX(1,3*N+M,5*N),MAX(1,N)) +! is the minimal workspace length of SGESVDQ. +! If WHTSVD == 4 :: +! If JOBZ == 'V', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +! If JOBZ == 'N', then +! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +! minimal workspace length of SGEJSV. +! The above expressions are not simplified in order to +! make the usage of WORK more transparent, and for +! easier checking. In any case, LWORK >= 2. +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT + +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 + EXTERNAL SLANGE, SLAMCH, SNRM2, ISAMAX + INTEGER ISAMAX + LOGICAL SISNAN, LSAME + EXTERNAL SISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SAXPY, SGEMM, SSCAL + EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & + SLACPY, SLASCL, SLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC INT, FLOAT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -18 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -21 + ELSE IF ( LDW < N ) THEN + INFO = -23 + ELSE IF ( LDS < N ) THEN + INFO = -25 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWORK = MAX(2,N) + OLWORK = MAX(2,N) + IMINWR = 1 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of SGESVD: + ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + MLWORK = MAX(MLWORK,N + MWRSVD) + IF ( LQUERY ) THEN + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, & + B, LDB, W, LDW, RDUMMY, -1, INFO1 ) + LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of SGESDD: + ! MWRSDD = 3*MIN(M,N)*MIN(M,N) + + ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + ! IMINWR = 8*MIN(M,N) + MWRSDD = 3*MIN(M,N)*MIN(M,N) + & + MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) + MLWORK = MAX(MLWORK,N + MWRSDD) + IMINWR = 8*MIN(M,N) + IF ( LQUERY ) THEN + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, & + LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) ) + OLWORK = MAX(OLWORK,N + LWRSDD) + END IF + CASE (3) + !LWQP3 = 3*N+1 + !LWORQ = MAX(N, 1) + !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) + !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) + !MLWORK = N + MWRSVQ + !IMINWR = M+N-1 + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, -1, RDUMMY, & + -1, RDUMMY2, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(RDUMMY(2)) + MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(RDUMMY(1)) + OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1))) + END IF + CASE (4) + JSVOPT = 'J' + !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' + MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 ) + MLWORK = MAX(MLWORK,N+MWRSVJ) + IMINWR = MAX( 3, M+3*N ) + IF ( LQUERY ) THEN + OLWORK = MAX(OLWORK,N+MWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the SGEEV call + IF ( LSAME(JOBZL,'V') ) THEN + MWRKEV = MAX( 1, 4*N ) + ELSE + MWRKEV = MAX( 1, 3*N ) + END IF + MLWORK = MAX(MLWORK,N+MWRKEV) + IF ( LQUERY ) THEN + CALL SGEEV( 'N', JOBZL, N, S, LDS, REIG, & + IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 ) + LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) ) + OLWORK = MAX( OLWORK, N+LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29 + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27 + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'SGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = SLAMCH('O') + SMALL = SLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using SLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + X(1,i), M, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( Y(ISAMAX(M, Y(1,i),1),i ) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL SSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using SLASSQ. + DO i = 1, N + !WORK(i) = DNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('SGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as WORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), M, INFO2 ) + WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + WORK(i) = SCALE * ROOTSC + CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, & + Y(1,i), M, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + WORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( WORK(i) > ZERO ) THEN + CALL SSCAL( M, ONE/WORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( WORK(i) < ZERO ) THEN + CALL SLASCL( 'G', 0, 0, -WORK(i), & + ONE/FLOAT(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL + ELSE IF ( X(ISAMAX(M, X(1,i),1),i ) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, B, & + LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (2) + CALL SGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, & + LDW, WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'T' + CASE (3) + CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, WORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),& + LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1) ! LAPACK CALL + CALL SLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'T' + CASE (4) + CALL SGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, & + N, X, LDX, WORK, Z, LDZ, W, LDW, & + WORK(N+1), LWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL SLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = WORK(N+1) + XSCL2 = WORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case SGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL SLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( WORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('SGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( WORK(i+1) <= WORK(i)*TOL ) .OR. & + ( WORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( WORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^T * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^T is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL SSCAL( N, ONE/WORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that SGESVD, SGESVDQ and SGESDD return the + ! transposed matrix of the right singular vectors. + !DO i = 1, K + ! CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + WORK(N+i) = ONE/WORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = (WORK(N+i))*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside SGEDMD). + CALL SGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, & + LDW, ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL SGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, & + LDZ, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^T * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL SGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! INTRINSIC + ! In the two SGEMM calls here, can use K for LDZ + CALL SGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, & + LDW, ZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^T * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^T is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL SLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL SLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL SGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, & + LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. Even in the case of complex spectrum, all + ! computation is done in real arithmetic. REIG and + ! IMEIG are the real and the imaginary parts of the + ! eigenvalues, so that the spectrum is given as + ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs + ! are listed at consecutive positions. For such a + ! complex conjugate pair of the eigenvalues, the + ! corresponding eigenvectors are also a complex + ! conjugate pair with the real and imaginary parts + ! stored column-wise in W at the corresponding + ! consecutive column indices. See the description of Z. + ! Also, see the description of SGEEV. + IF ( INFO1 > 0 ) THEN + ! SGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL SGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, & + LDW, ZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, Z, LDZ ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL SLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, & + W, LDW, ZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + LDS, ZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & + ! LDS, ZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the real form of the Ritz vectors + IF ( WNTVEC ) CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + i = 1 + DO WHILE ( i <= K ) + IF ( IMEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! INTRINSIC + RES(i) = SNRM2( M, Y(1,i), 1 ) ! BLAS CALL + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IMEIG(i) + AB(1,2) = IMEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + LDZ, AB, 2, ONE, Y(1,i), LDY ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES(i) = SLANGE( 'F', M, 2, Y(1,i), LDY, & + WORK(N+1) ) ! LAPACK CALL + RES(i+1) = RES(i) + i = i + 2 + END IF + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + WORK(N+1) = XSCL1 + WORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE SGEDMD + diff --git a/lapack-netlib/SRC/sgedmdq.f90 b/lapack-netlib/SRC/sgedmdq.f90 new file mode 100644 index 000000000..acd5d56c6 --- /dev/null +++ b/lapack-netlib/SRC/sgedmdq.f90 @@ -0,0 +1,703 @@ +SUBROUTINE SGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, REIG, IMEIG, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, WORK, LWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*) + REAL(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, SGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, SGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! orthogonal matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: SGESVD (the QR SVD algorithm) +! 2 :: SGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F) +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) REAL(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by SGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in WORK(1:N). +! See the description of WORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K)of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X +!..... +! Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! REIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N) entries of REIG contain +! the real parts of the computed eigenvalues +! REIG(1:K) + sqrt(-1)*IMEIG(1:K). +! See the descriptions of K, IMEIG, Z. +!..... +! IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array +! The leading K (K0, then the corresponding complex +! conjugate pair of eigenvalues reads +! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +! That is, complex conjugate pairs have consecutive +! indices (i,i+1), with the positive imaginary part +! listed first. +! See the descriptions of K, REIG, Z. +!..... +! Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then +! Z contains real Ritz vectors as follows: +! If IMEIG(i)=0, then Z(:,i) is an eigenvector of +! the i-th Ritz value. +! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +! [Z(:,i) Z(:,i+1)] span an invariant subspace and +! the Ritz values extracted from this subspace are +! REIG(i) + sqrt(-1)*IMEIG(i) and +! REIG(i) - sqrt(-1)*IMEIG(i). +! The corresponding eigenvectors are +! Z(:,i) + sqrt(-1)*Z(:,i+1) and +! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +! If JOBZ == 'F', then the above descriptions hold for +! the columns of Z*V, where the columns of V are the +! eigenvectors of the K-by-K Rayleigh quotient, and Z is +! orthonormal. The columns of V are similarly structured: +! If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if +! IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and +! Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) +! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +! See the descriptions of REIG, IMEIG, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs. +! If LAMBDA(i) is real, then +! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +! then +! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +! It holds that +! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +! See the description of Z. +!..... +! B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) contains the K eigenvectors of +! the Rayleigh quotient. The eigenvectors of a complex +! conjugate pair of eigenvalues are returned in real form +! as explained in the description of Z. The Ritz vectors +! (returned in Z) are the product of X and V; see +! the descriptions of X and Z. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) REAL(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by SGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by SGEQRF of the +! M-by-N input matrix F. +! WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to SGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. Hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for SGEQRF[M,N]) +! MLWDMD = minimal workspace for SGEDMD (see the +! description of LWORK in SGEDMD) for +! snapshots of dimensions MIN(M,N)-by-(N-1) +! MLWMQR = N (minimal workspace for +! SORMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for SORGQR[M,N,N]) +! Then +! LWORK = MAX(N+MLWQR, N+MLWDMD) +! is updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) +! if JOBQ == 'Q' THEN +! LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a worskpace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MLWDMD, MLWGQR, & + MLWMQR, MLWORK, MLWQR, MINMN, & + OLWDMD, OLWGQR, OLWMQR, OLWORK, & + OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! Local array +! ~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SGEMM + EXTERNAL SGEQRF, SLACPY, SLASET, SORGQR, & + SORMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL SGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -22 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -25 + ELSE IF ( LDV < N-1 ) THEN + INFO = -27 + ELSE IF ( LDS < N-1 ) THEN + INFO = -29 + END IF +! + IF ( WNTVEC .OR. WNTVCF ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + MLWQR = MAX(1,N) ! Minimal workspace length for SGEQRF. + MLWORK = MIN(M,N) + MLWQR + IF ( LQUERY ) THEN + CALL SGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, & + INFO1 ) + OLWQR = INT(RDUMMY(1)) + OLWORK = MIN(M,N) + OLWQR + END IF + CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, & + V, LDV, S, LDS, WORK, -1, IWORK, & + LIWORK, INFO1 ) + MLWDMD = INT(WORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + IMINWR = IWORK(1) + IF ( LQUERY ) THEN + OLWDMD = INT(WORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR) + IF ( LQUERY ) THEN + CALL SORMQR( 'L','N', M, N, MINMN, F, LDF, & + WORK, Z, LDZ, WORK, -1, INFO1 ) + OLWMQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = N + MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR) + IF ( LQUERY ) THEN + CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK, -1, INFO1 ) + OLWGQR = INT(WORK(1)) + OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR) + END IF + END IF + IMINWR = MAX( 1, IMINWR ) + MLWORK = MAX( 2, MLWORK ) + IF ( LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31 + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'SGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + WORK(1) = MLWORK + WORK(2) = OLWORK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >>N , at this place +! one can use an out of core QRF. +! + CALL SGEQRF( M, N, F, LDF, WORK, & + WORK(MINMN+1), LWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL SLASET( 'L', MINMN, N-1, ZERO, ZERO, X, LDX ) + CALL SLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL SLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL SLASET( 'L', MINMN-2, N-2, ZERO, ZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + REIG, IMEIG, Z, LDZ, RES, B, LDB, V, & + LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, IWORK, & + LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL SLASET( 'A', M-MINMN, K, ZERO, & + ZERO, Z(MINMN+1,1), LDZ ) + CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by SGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by SGEDMD. + CALL SLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL SLASET( 'A', M-N, K, ZERO, ZERO, & + Z(N+1,1), LDZ ) + CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z, & + LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to SGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL SLASET( 'A', MINMN, N, ZERO, ZERO, Y, LDY ) + CALL SLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/orthogonal factor in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, & + WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE SGEDMDQ + \ No newline at end of file diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 new file mode 100644 index 000000000..090641ad8 --- /dev/null +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -0,0 +1,996 @@ + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 + +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!............................................................ +! Purpose +! ======= +! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, ZGEDMD computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, ZGEDMD returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +!...................................................................... +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!............................................................ +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product X(:,1:K)*W, where X +! contains a POD basis (leading left singular vectors +! of the data matrix X) and W contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of K, X, W, Z. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will be +! computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: ZGESVD (the QR SVD algorithm) +! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M>= 0 +! The state space dimension (the row dimension of X, Y). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshot pairs +! (the number of columns of X and Y). +!..... +! X (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, X contains the data snapshot matrix X. It is +! assumed that the column norms of X are in the range of +! the normalized floating point numbers. +! < On exit, the leading K columns of X contain a POD basis, +! i.e. the leading K left singular vectors of the input +! data matrix X, U(:,1:K). All N columns of X contain all +! left singular vectors of the input matrix X. +! See the descriptions of K, Z and W. +!..... +! LDX (input) INTEGER, LDX >= M +! The leading dimension of the array X. +!..... +! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, Y contains the data snapshot matrix Y +! < On exit, +! If JOBR == 'R', the leading K columns of Y contain +! the residual vectors for the computed Ritz pairs. +! See the description of RES. +! If JOBR == 'N', Y contains the original input data, +! scaled according to the value of JOBS. +!..... +! LDY (input) INTEGER , LDY >= M +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the descriptions of TOL and K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the POD basis for the data snapshot +! matrix X and the number of the computed Ritz pairs. +! The value of K is determined according to the rule set +! by the parameters NRNK and TOL. +! See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) N-by-1 array +! The leading K (K<=N) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of W(1:k,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and W. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) N-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) M-by-N array. +! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:M,1:K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! If JOBF =='N', then B is not referenced. +! See the descriptions of X, W, K. +!..... +! LDB (input) INTEGER, LDB >= M +! The leading dimension of the array B. +!..... +! W (workspace/output) COMPLEX(KIND=WP) N-by-N array +! On exit, W(1:K,1:K) contains the K computed +! eigenvectors of the matrix Rayleigh quotient. +! The Ritz vectors (returned in Z) are the +! product of X (containing a POD basis for the input +! matrix X) and W. See the descriptions of K, S, X and Z. +! W is also used as a workspace to temporarily store the +! right singular vectors of X. +!..... +! LDW (input) INTEGER, LDW >= N +! The leading dimension of the array W. +!..... +! S (workspace/output) COMPLEX(KIND=WP) N-by-N array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by ZGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +! ZWORK is used as complex workspace in the complex SVD, as +! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +! the eigenvalues of a Rayleigh quotient. +! If the call to ZGEDMD is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +! LZWORK_SVD is calculated as follows +! If WHTSVD == 1 :: ZGESVD :: +! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +! If WHTSVD == 2 :: ZGESDD :: +! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +! If WHTSVD == 3 :: ZGESVDQ :: +! LZWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: ZGEJSV :: +! LZWORK_SVD = obtainable by a query +! If on entry LZWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths and returns them in +! LZWORK(1) and LZWORK(2), respectively. +!..... +! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +! On exit, RWORK(1:N) contains the singular values of +! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +! and Y to avoid overflow in the SVD of X. +! This may be of interest if the scaling option is off +! and as many as possible smallest eigenvalues are +! desired to the highest feasible accuracy. +! If the call to ZGEDMD is only workspace query, then +! RWORK(1) contains the minimal workspace length. +! See the description of LRWORK. +!..... +! LRWORK (input) INTEGER +! The minimal length of the workspace vector RWORK. +! LRWORK is calculated as follows: +! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +! for the SVD subroutine determined by the input parameter +! WHTSVD. +! If WHTSVD == 1 :: ZGESVD :: +! LRWORK_SVD = 5*MIN(M,N) +! If WHTSVD == 2 :: ZGESDD :: +! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +! If WHTSVD == 3 :: ZGESVDQ :: +! LRWORK_SVD = obtainable by a query +! If WHTSVD == 4 :: ZGEJSV :: +! LRWORK_SVD = obtainable by a query +! If on entry LRWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! real workspace length and returns it in RWORK(1). +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for ZWORK, RWORK and +! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) + +! Local scalars +! ~~~~~~~~~~~~~ + REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & + SSUM, XSCL1, XSCL2 + INTEGER :: i, j, IMINWR, INFO1, INFO2, & + LWRKEV, LWRSDD, LWRSVD, LWRSVJ, & + LWRSVQ, MLWORK, MWRKEV, MWRSDD, & + MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, & + OLWORK, MLRWRK + LOGICAL :: BADXY, LQUERY, SCCOLX, SCCOLY, & + WNTEX, WNTREF, WNTRES, WNTVEC + CHARACTER :: JOBZL, T_OR_N + CHARACTER :: JSVOPT +! +! Local arrays +! ~~~~~~~~~~~~ + REAL(KIND=WP) :: RDUMMY(2) + +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 + EXTERNAL ZLANGE, DLAMCH, DZNRM2, IZAMAX + INTEGER IZAMAX + LOGICAL DISNAN, LSAME + EXTERNAL DISNAN, LSAME + +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZAXPY, ZGEMM, ZDSCAL + EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & + ZLACPY, ZLASCL, ZLASSQ, XERBLA + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC DBLE, INT, MAX, SQRT +!............................................................ +! +! Test the input arguments +! + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C') + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + INFO = 0 + LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) & + .OR. ( LRWORK == -1 ) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N') & + .OR. LSAME(JOBZ,'F')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. (.NOT.WNTVEC) ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -4 + ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR. & + (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN + INFO = -5 + ELSE IF ( M < 0 ) THEN + INFO = -6 + ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN + INFO = -7 + ELSE IF ( LDX < M ) THEN + INFO = -9 + ELSE IF ( LDY < M ) THEN + INFO = -11 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -12 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -13 + ELSE IF ( LDZ < M ) THEN + INFO = -17 + ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN + INFO = -20 + ELSE IF ( LDW < N ) THEN + INFO = -22 + ELSE IF ( LDS < N ) THEN + INFO = -24 + END IF +! + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( N == 0 ) THEN + ! Quick return. All output except K is void. + ! INFO=1 signals the void input. + ! In case of a workspace query, the default + ! minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + RWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + IMINWR = 1 + MLRWRK = MAX(1,N) + MLWORK = 2 + OLWORK = 2 + SELECT CASE ( WHTSVD ) + CASE (1) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESVD: + ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) + MLWORK = MAX(MLWORK,MWRSVD) + MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N)) + IF ( LQUERY ) THEN + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, & + B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 ) + LWRSVD = INT( ZWORK(1) ) + OLWORK = MAX(OLWORK,LWRSVD) + END IF + CASE (2) + ! The following is specified as the minimal + ! length of WORK in the definition of ZGESDD: + ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N). + ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) + ! In LAPACK 3.10.1 RWORK is defined differently. + ! Below we take max over the two versions. + ! IMINWR = 8*MIN(M,N) + MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) + MLWORK = MAX(MLWORK,MWRSDD) + IMINWR = 8*MIN(M,N) + MLRWRK = MAX( MLRWRK, N + & + MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), & + 5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), & + 2*MAX(M,N)*MIN(M,N)+ & + 2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) + IF ( LQUERY ) THEN + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B,LDB,& + W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 ) + LWRSDD = MAX( MWRSDD,INT( ZWORK(1) )) + ! Possible bug in ZGESDD optimal workspace size. + OLWORK = MAX(OLWORK,LWRSDD) + END IF + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK, & + IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 ) + IMINWR = IWORK(1) + MWRSVQ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVQ) + MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1))) + IF ( LQUERY ) THEN + LWRSVQ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVQ) + END IF + CASE (4) + JSVOPT = 'J' + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, -1, RDUMMY, -1, IWORK, INFO1 ) + IMINWR = IWORK(1) + MWRSVJ = INT(ZWORK(2)) + MLWORK = MAX(MLWORK,MWRSVJ) + MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1)))) + IF ( LQUERY ) THEN + LWRSVJ = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,LWRSVJ) + END IF + END SELECT + IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN + JOBZL = 'V' + ELSE + JOBZL = 'N' + END IF + ! Workspace calculation to the ZGEEV call + MWRKEV = MAX( 1, 2*N ) + MLWORK = MAX(MLWORK,MWRKEV) + MLRWRK = MAX(MLRWRK,N+2*N) + IF ( LQUERY ) THEN + CALL ZGEEV( 'N', JOBZL, N, S, LDS, EIGS, & + W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) + LWRKEV = INT(ZWORK(1)) + OLWORK = MAX( OLWORK, LWRKEV ) + END IF +! + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30 + IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26 + + END IF +! + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMD', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + RWORK(1) = MLRWRK + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + RETURN + END IF +!............................................................ +! + OFL = DLAMCH('O') + SMALL = DLAMCH('S') + BADXY = .FALSE. +! +! <1> Optional scaling of the snapshots (columns of X, Y) +! ========================================================== + IF ( SCCOLX ) THEN + ! The columns of X will be normalized. + ! To prevent overflows, the column norms of X are + ! carefully computed using ZLASSQ. + K = 0 + DO i = 1, N + !WORK(i) = DZNRM2( M, X(1,i), 1 ) + SCALE = ZERO + CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of X(:,i) overflows. First, X(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. +! Next, the norm of X(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of X(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, X(1,i), LDX, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! X(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + X(1,i), LDX, INFO2 ) ! LAPACK CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + K = K + 1 + END IF + END DO + IF ( K == N ) THEN + ! All columns of X are zero. Return error code -8. + ! (the 8th input variable had an illegal value) + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF + DO i = 1, N +! Now, apply the same scaling to the columns of Y. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(Y(IZAMAX(M, Y(1,i),1),i )) & + /= ZERO ) THEN +! X(:,i) is zero vector. For consistency, +! Y(:,i) should also be zero. If Y(:,i) is not +! zero, then the data might be inconsistent or +! corrupted. If JOBS == 'C', Y(:,i) is set to +! zero and a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + IF ( LSAME(JOBS,'C')) & + CALL ZDSCAL( M, ZERO, Y(1,i), 1 ) ! BLAS CALL + END IF + END DO + END IF + ! + IF ( SCCOLY ) THEN + ! The columns of Y will be normalized. + ! To prevent overflows, the column norms of Y are + ! carefully computed using ZLASSQ. + DO i = 1, N + !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) + SCALE = ZERO + CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) + IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN + K = 0 + INFO = -10 + CALL XERBLA('ZGEDMD',-INFO) + END IF + IF ( SCALE /= ZERO .AND. (SSUM /= ZERO) ) THEN + ROOTSC = SQRT(SSUM) + IF ( SCALE .GE. (OFL / ROOTSC) ) THEN +! Norm of Y(:,i) overflows. First, Y(:,i) +! is scaled by +! ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. +! Next, the norm of Y(:,i) is stored without +! overflow as RWORK(i) = - SCALE * (ROOTSC/M), +! the minus sign indicating the 1/M factor. +! Scaling is performed without overflow, and +! underflow may occur in the smallest entries +! of Y(:,i). The relative backward and forward +! errors are small in the ell_2 norm. + CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, & + M, 1, Y(1,i), LDY, INFO2 ) + RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) ) + ELSE +! Y(:,i) will be scaled to unit 2-norm + RWORK(i) = SCALE * ROOTSC + CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, & + Y(1,i), LDY, INFO2 ) ! LAPACK CALL +! Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC + END IF + ELSE + RWORK(i) = ZERO + END IF + END DO + DO i = 1, N +! Now, apply the same scaling to the columns of X. + IF ( RWORK(i) > ZERO ) THEN + CALL ZDSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL +! X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC + ELSE IF ( RWORK(i) < ZERO ) THEN + CALL ZLASCL( 'G', 0, 0, -RWORK(i), & + ONE/DBLE(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL + ELSE IF ( ABS(X(IZAMAX(M, X(1,i),1),i )) & + /= ZERO ) THEN +! Y(:,i) is zero vector. If X(:,i) is not +! zero, then a warning flag is raised. +! The computation continues but the +! situation will be reported in the output. + BADXY = .TRUE. + END IF + END DO + END IF +! +! <2> SVD of the data snapshot matrix X. +! ===================================== +! The left singular vectors are stored in the array X. +! The right singular vectors are in the array W. +! The array W will later on contain the eigenvectors +! of a Rayleigh quotient. + NUMRNK = N + SELECT CASE ( WHTSVD ) + CASE (1) + CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, & + LDB, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (2) + CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, & + LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 ) ! LAPACK CALL + T_OR_N = 'C' + CASE (3) + CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, & + X, LDX, RWORK, Z, LDZ, W, LDW, & + NUMRNK, IWORK, LIWORK, ZWORK, & + LZWORK, RWORK(N+1), LRWORK-N, INFO1) ! LAPACK CALL + CALL ZLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'C' + CASE (4) + CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, & + N, X, LDX, RWORK, Z, LDZ, W, LDW, & + ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 ) ! LAPACK CALL + CALL ZLACPY( 'A', M, N, Z, LDZ, X, LDX ) ! LAPACK CALL + T_OR_N = 'N' + XSCL1 = RWORK(N+1) + XSCL2 = RWORK(N+2) + IF ( XSCL1 /= XSCL2 ) THEN + ! This is an exceptional situation. If the + ! data matrices are not scaled and the + ! largest singular value of X overflows. + ! In that case ZGEJSV can return the SVD + ! in scaled form. The scaling factor can be used + ! to rescale the data (X and Y). + CALL ZLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2 ) + END IF + END SELECT +! + IF ( INFO1 > 0 ) THEN + ! The SVD selected subroutine did not converge. + ! Return with an error code. + INFO = 2 + RETURN + END IF +! + IF ( RWORK(1) == ZERO ) THEN + ! The largest computed singular value of (scaled) + ! X is zero. Return error code -8 + ! (the 8th input variable had an illegal value). + K = 0 + INFO = -8 + CALL XERBLA('ZGEDMD',-INFO) + RETURN + END IF +! + !<3> Determine the numerical rank of the data + ! snapshots matrix X. This depends on the + ! parameters NRNK and TOL. + + SELECT CASE ( NRNK ) + CASE ( -1 ) + K = 1 + DO i = 2, NUMRNK + IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE ( -2 ) + K = 1 + DO i = 1, NUMRNK-1 + IF ( ( RWORK(i+1) <= RWORK(i)*TOL ) .OR. & + ( RWORK(i) <= SMALL ) ) EXIT + K = K + 1 + END DO + CASE DEFAULT + K = 1 + DO i = 2, NRNK + IF ( RWORK(i) <= SMALL ) EXIT + K = K + 1 + END DO + END SELECT + ! Now, U = X(1:M,1:K) is the SVD/POD basis for the + ! snapshot data in the input matrix X. + + !<4> Compute the Rayleigh quotient S = U^H * A * U. + ! Depending on the requested outputs, the computation + ! is organized to compute additional auxiliary + ! matrices (for the residuals and refinements). + ! + ! In all formulas below, we need V_k*Sigma_k^(-1) + ! where either V_k is in W(1:N,1:K), or V_k^H is in + ! W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). + IF ( LSAME(T_OR_N, 'N') ) THEN + DO i = 1, K + CALL ZDSCAL( N, ONE/RWORK(i), W(1,i), 1 ) ! BLAS CALL + ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC + END DO + ELSE + ! This non-unit stride access is due to the fact + ! that ZGESVD, ZGESVDQ and ZGESDD return the + ! adjoint matrix of the right singular vectors. + !DO i = 1, K + ! CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL + ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC + !END DO + DO i = 1, K + RWORK(N+i) = ONE/RWORK(i) + END DO + DO j = 1, N + DO i = 1, K + W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j) + END DO + END DO + END IF +! + IF ( WNTREF ) THEN + ! + ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) + ! for computing the refined Ritz vectors + ! (optionally, outside ZGEDMD). + CALL ZGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, & + LDW, ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C' + ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRINSIC, for T_OR_N=='N' + ! + ! At this point Z contains + ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and + ! this is needed for computing the residuals. + ! This matrix is returned in the array B and + ! it can be used to compute refined Ritz vectors. + CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) ! BLAS CALL + ! B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC + + CALL ZGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, & + LDZ, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC + ! At this point S = U^H * A * U is the Rayleigh quotient. + ELSE + ! A * U(:,1:K) is not explicitly needed and the + ! computation is organized differently. The Rayleigh + ! quotient is computed more efficiently. + CALL ZGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) ) ! INTRINSIC + ! + CALL ZGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, & + LDW, ZZERO, S, LDS ) ! BLAS CALL + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='T' + ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRINSIC, for T_OR_N=='N' + ! At this point S = U^H * A * U is the Rayleigh quotient. + ! If the residuals are requested, save scaled V_k into Z. + ! Recall that V_k or V_k^H is stored in W. + IF ( WNTRES .OR. WNTEX ) THEN + IF ( LSAME(T_OR_N, 'N') ) THEN + CALL ZLACPY( 'A', N, K, W, LDW, Z, LDZ ) + ELSE + CALL ZLACPY( 'A', K, N, W, LDW, Z, LDZ ) + END IF + END IF + END IF +! + !<5> Compute the Ritz values and (if requested) the + ! right eigenvectors of the Rayleigh quotient. + ! + CALL ZGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, LDW, & + W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 ) ! LAPACK CALL + ! + ! W(1:K,1:K) contains the eigenvectors of the Rayleigh + ! quotient. See the description of Z. + ! Also, see the description of ZGEEV. + IF ( INFO1 > 0 ) THEN + ! ZGEEV failed to compute the eigenvalues and + ! eigenvectors of the Rayleigh quotient. + INFO = 3 + RETURN + END IF +! + ! <6> Compute the eigenvectors (if requested) and, + ! the residuals (if requested). + ! + IF ( WNTVEC .OR. WNTEX ) THEN + IF ( WNTRES ) THEN + IF ( WNTREF ) THEN + ! Here, if the refinement is requested, we have + ! A*U(:,1:K) already computed and stored in Z. + ! For the residuals, need Y = A * U(:,1;K) * W. + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, & + LDW, ZZERO, Y, LDY ) ! BLAS CALL + ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC + ! This frees Z; Y contains A * U(:,1:K) * W. + ELSE + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z + CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, Z, LDZ ) + ! Save a copy of Z into Y and free Z for holding + ! the Ritz vectors. + CALL ZLACPY( 'A', M, K, Z, LDZ, Y, LDY ) + IF ( WNTEX ) CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF + ELSE IF ( WNTEX ) THEN + ! Compute S = V_k * Sigma_k^(-1) * W, where + ! V_k * Sigma_k^(-1) is stored in Z + CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, & + W, LDW, ZZERO, S, LDS ) + ! Then, compute Z = Y * S = + ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = + ! = A * U(:,1:K) * W(1:K,1:K) + CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + LDS, ZZERO, B, LDB ) + ! The above call replaces the following two calls + ! that were used in the developing-testing phase. + ! CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & + ! LDS, ZZERO, Z, LDZ) + ! Save a copy of Z into B and free Z for holding + ! the Ritz vectors. + ! CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) + END IF +! + ! Compute the Ritz vectors + IF ( WNTVEC ) CALL ZGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, & + ZZERO, Z, LDZ ) ! BLAS CALL + ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC +! + IF ( WNTRES ) THEN + DO i = 1, K + CALL ZAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 ) ! BLAS CALL + ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTRINSIC + RES(i) = DZNRM2( M, Y(1,i), 1 ) ! BLAS CALL + END DO + END IF + END IF +! + IF ( WHTSVD == 4 ) THEN + RWORK(N+1) = XSCL1 + RWORK(N+2) = XSCL2 + END IF +! +! Successful exit. + IF ( .NOT. BADXY ) THEN + INFO = 0 + ELSE + ! A warning on possible data inconsistency. + ! This should be a rare event. + INFO = 4 + END IF +!............................................................ + RETURN +! ...... + END SUBROUTINE ZGEDMD + diff --git a/lapack-netlib/SRC/zgedmdq.f90 b/lapack-netlib/SRC/zgedmdq.f90 new file mode 100644 index 000000000..51be72a32 --- /dev/null +++ b/lapack-netlib/SRC/zgedmdq.f90 @@ -0,0 +1,689 @@ +SUBROUTINE ZGEDMDQ( JOBS, JOBZ, JOBR, JOBQ, JOBT, JOBF, & + WHTSVD, M, N, F, LDF, X, LDX, Y, & + LDY, NRNK, TOL, K, EIGS, & + Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, LZWORK, WORK, LWORK, & + IWORK, LIWORK, INFO ) +! March 2023 +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +!..... +! Scalar arguments + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBQ, & + JOBT, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDF, LDX, & + LDY, NRNK, LDZ, LDB, LDV, & + LDS, LZWORK, LWORK, LIWORK + INTEGER, INTENT(OUT) :: INFO, K + REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments + COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*) + COMPLEX(KIND=WP), INTENT(OUT) :: X(LDX,*), Y(LDY,*), & + Z(LDZ,*), B(LDB,*), & + V(LDV,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +!..... +! Purpose +! ======= +! ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for +! a pair of data snapshot matrices, using a QR factorization +! based compression of the data. For the input matrices +! X and Y such that Y = A*X with an unaccessible matrix +! A, ZGEDMDQ computes a certain number of Ritz pairs of A using +! the standard Rayleigh-Ritz extraction from a subspace of +! range(X) that is determined using the leading left singular +! vectors of X. Optionally, ZGEDMDQ returns the residuals +! of the computed Ritz pairs, the information needed for +! a refinement of the Ritz vectors, or the eigenvectors of +! the Exact DMD. +! For further details see the references listed +! below. For more details of the implementation see [3]. +! +! References +! ========== +! [1] P. Schmid: Dynamic mode decomposition of numerical +! and experimental data, +! Journal of Fluid Mechanics 656, 5-28, 2010. +! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +! decompositions: analysis and enhancements, +! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +! [3] Z. Drmac: A LAPACK implementation of the Dynamic +! Mode Decomposition I. Technical report. AIMDyn Inc. +! and LAPACK Working Note 298. +! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +! Brunton, N. Kutz: On Dynamic Mode Decomposition: +! Theory and Applications, Journal of Computational +! Dynamics 1(2), 391 -421, 2014. +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! and supported by +! - DARPA SBIR project "Koopman Operator-Based Forecasting +! for Nonstationary Processes from Near-Term, Limited +! Observational Data" Contract No: W31P4Q-21-C-0007 +! - DARPA PAI project "Physics-Informed Machine Learning +! Methodologies" Contract No: HR0011-18-9-0033 +! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +! Framework for Space-Time Analysis of Process Dynamics" +! Contract No: HR0011-16-C-0116 +! Any opinions, findings and conclusions or recommendations +! expressed in this material are those of the author and +! do not necessarily reflect the views of the DARPA SBIR +! Program Office. +!============================================================ +! Distribution Statement A: +! Approved for Public Release, Distribution Unlimited. +! Cleared by DARPA on September 29, 2022 +!============================================================ +!...................................................................... +! Arguments +! ========= +! JOBS (input) CHARACTER*1 +! Determines whether the initial data snapshots are scaled +! by a diagonal matrix. The data snapshots are the columns +! of F. The leading N-1 columns of F are denoted X and the +! trailing N-1 columns are denoted Y. +! 'S' :: The data snapshots matrices X and Y are multiplied +! with a diagonal matrix D so that X*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'C' :: The snapshots are scaled as with the 'S' option. +! If it is found that an i-th column of X is zero +! vector and the corresponding i-th column of Y is +! non-zero, then the i-th column of Y is set to +! zero and a warning flag is raised. +! 'Y' :: The data snapshots matrices X and Y are multiplied +! by a diagonal matrix D so that Y*D has unit +! nonzero columns (in the Euclidean 2-norm) +! 'N' :: No data scaling. +!..... +! JOBZ (input) CHARACTER*1 +! Determines whether the eigenvectors (Koopman modes) will +! be computed. +! 'V' :: The eigenvectors (Koopman modes) will be computed +! and returned in the matrix Z. +! See the description of Z. +! 'F' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Z*V, where Z +! is orthonormal and V contains the eigenvectors +! of the corresponding Rayleigh quotient. +! See the descriptions of F, V, Z. +! 'Q' :: The eigenvectors (Koopman modes) will be returned +! in factored form as the product Q*Z, where Z +! contains the eigenvectors of the compression of the +! underlying discretized operator onto the span of +! the data snapshots. See the descriptions of F, V, Z. +! Q is from the initial QR factorization. +! 'N' :: The eigenvectors are not computed. +!..... +! JOBR (input) CHARACTER*1 +! Determines whether to compute the residuals. +! 'R' :: The residuals for the computed eigenpairs will +! be computed and stored in the array RES. +! See the description of RES. +! For this option to be legal, JOBZ must be 'V'. +! 'N' :: The residuals are not computed. +!..... +! JOBQ (input) CHARACTER*1 +! Specifies whether to explicitly compute and return the +! unitary matrix from the QR factorization. +! 'Q' :: The matrix Q of the QR factorization of the data +! snapshot matrix is computed and stored in the +! array F. See the description of F. +! 'N' :: The matrix Q is not explicitly computed. +!..... +! JOBT (input) CHARACTER*1 +! Specifies whether to return the upper triangular factor +! from the QR factorization. +! 'R' :: The matrix R of the QR factorization of the data +! snapshot matrix F is returned in the array Y. +! See the description of Y and Further details. +! 'N' :: The matrix R is not returned. +!..... +! JOBF (input) CHARACTER*1 +! Specifies whether to store information needed for post- +! processing (e.g. computing refined Ritz vectors) +! 'R' :: The matrix needed for the refinement of the Ritz +! vectors is computed and stored in the array B. +! See the description of B. +! 'E' :: The unscaled eigenvectors of the Exact DMD are +! computed and returned in the array B. See the +! description of B. +! 'N' :: No eigenvector refinement data is computed. +! To be useful on exit, this option needs JOBQ='Q'. +!..... +! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +! Allows for a selection of the SVD algorithm from the +! LAPACK library. +! 1 :: ZGESVD (the QR SVD algorithm) +! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +! workspace available, this is the fastest option) +! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +! are the most accurate options) +! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +! are the most accurate options) +! For the four methods above, a significant difference in +! the accuracy of small singular values is possible if +! the snapshots vary in norm so that X is severely +! ill-conditioned. If small (smaller than EPS*||X||) +! singular values are of interest and JOBS=='N', then +! the options (3, 4) give the most accurate results, where +! the option 4 is slightly better and with stronger +! theoretical background. +! If JOBS=='S', i.e. the columns of X will be normalized, +! then all methods give nearly equally accurate results. +!..... +! M (input) INTEGER, M >= 0 +! The state space dimension (the number of rows of F). +!..... +! N (input) INTEGER, 0 <= N <= M +! The number of data snapshots from a single trajectory, +! taken at equidistant discrete times. This is the +! number of columns of F. +!..... +! F (input/output) COMPLEX(KIND=WP) M-by-N array +! > On entry, +! the columns of F are the sequence of data snapshots +! from a single trajectory, taken at equidistant discrete +! times. It is assumed that the column norms of F are +! in the range of the normalized floating point numbers. +! < On exit, +! If JOBQ == 'Q', the array F contains the orthogonal +! matrix/factor of the QR factorization of the initial +! data snapshots matrix F. See the description of JOBQ. +! If JOBQ == 'N', the entries in F strictly below the main +! diagonal contain, column-wise, the information on the +! Householder vectors, as returned by ZGEQRF. The +! remaining information to restore the orthogonal matrix +! of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). +! See the description of ZWORK. +!..... +! LDF (input) INTEGER, LDF >= M +! The leading dimension of the array F. +!..... +! X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array +! X is used as workspace to hold representations of the +! leading N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, the leading K columns of X contain the leading +! K left singular vectors of the above described content +! of X. To lift them to the space of the left singular +! vectors U(:,1:K) of the input data, pre-multiply with the +! Q factor from the initial QR factorization. +! See the descriptions of F, K, V and Z. +!..... +! LDX (input) INTEGER, LDX >= N +! The leading dimension of the array X. +!..... +! Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array +! Y is used as workspace to hold representations of the +! trailing N-1 snapshots in the orthonormal basis computed +! in the QR factorization of F. +! On exit, +! If JOBT == 'R', Y contains the MIN(M,N)-by-N upper +! triangular factor from the QR factorization of the data +! snapshot matrix F. +!..... +! LDY (input) INTEGER , LDY >= N +! The leading dimension of the array Y. +!..... +! NRNK (input) INTEGER +! Determines the mode how to compute the numerical rank, +! i.e. how to truncate small singular values of the input +! matrix X. On input, if +! NRNK = -1 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(1) +! This option is recommended. +! NRNK = -2 :: i-th singular value sigma(i) is truncated +! if sigma(i) <= TOL*sigma(i-1) +! This option is included for R&D purposes. +! It requires highly accurate SVD, which +! may not be feasible. +! The numerical rank can be enforced by using positive +! value of NRNK as follows: +! 0 < NRNK <= N-1 :: at most NRNK largest singular values +! will be used. If the number of the computed nonzero +! singular values is less than NRNK, then only those +! nonzero values will be used and the actually used +! dimension is less than NRNK. The actual number of +! the nonzero singular values is returned in the variable +! K. See the description of K. +!..... +! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +! The tolerance for truncating small singular values. +! See the description of NRNK. +!..... +! K (output) INTEGER, 0 <= K <= N +! The dimension of the SVD/POD basis for the leading N-1 +! data snapshots (columns of F) and the number of the +! computed Ritz pairs. The value of K is determined +! according to the rule set by the parameters NRNK and +! TOL. See the descriptions of NRNK and TOL. +!..... +! EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array +! The leading K (K<=N-1) entries of EIGS contain +! the computed eigenvalues (Ritz values). +! See the descriptions of K, and Z. +!..... +! Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array +! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +! Z*V, where Z contains orthonormal matrix (the product of +! Q from the initial QR factorization and the SVD/POD_basis +! returned by ZGEDMD in X) and the second factor (the +! eigenvectors of the Rayleigh quotient) is in the array V, +! as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) +! is an eigenvector corresponding to EIGS(i). The columns +! of V(1:K,1:K) are the computed eigenvectors of the +! K-by-K Rayleigh quotient. +! See the descriptions of EIGS, X and V. +!..... +! LDZ (input) INTEGER , LDZ >= M +! The leading dimension of the array Z. +!..... +! RES (output) REAL(KIND=WP) (N-1)-by-1 array +! RES(1:K) contains the residuals for the K computed +! Ritz pairs, +! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +! See the description of EIGS and Z. +!..... +! B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. +! IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can +! be used for computing the refined vectors; see further +! details in the provided references. +! If JOBF == 'E', B(1:N,1;K) contains +! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +! Exact DMD, up to scaling by the inverse eigenvalues. +! In both cases, the content of B can be lifted to the +! original dimension of the input data by pre-multiplying +! with the Q factor from the initial QR factorization. +! Here A denotes a compression of the underlying operator. +! See the descriptions of F and X. +! If JOBF =='N', then B is not referenced. +!..... +! LDB (input) INTEGER, LDB >= MIN(M,N) +! The leading dimension of the array B. +!..... +! V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! On exit, V(1:K,1:K) V contains the K eigenvectors of +! the Rayleigh quotient. The Ritz vectors +! (returned in Z) are the product of Q from the initial QR +! factorization (see the description of F) X (see the +! description of X) and V. +!..... +! LDV (input) INTEGER, LDV >= N-1 +! The leading dimension of the array V. +!..... +! S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array +! The array S(1:K,1:K) is used for the matrix Rayleigh +! quotient. This content is overwritten during +! the eigenvalue decomposition by ZGEEV. +! See the description of K. +!..... +! LDS (input) INTEGER, LDS >= N-1 +! The leading dimension of the array S. +!..... +! ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array +! On exit, +! ZWORK(1:MIN(M,N)) contains the scalar factors of the +! elementary reflectors as returned by ZGEQRF of the +! M-by-N input matrix F. +! If the call to ZGEDMDQ is only workspace query, then +! ZWORK(1) contains the minimal complex workspace length and +! ZWORK(2) is the optimal complex workspace length. +! Hence, the length of work is at least 2. +! See the description of LZWORK. +!..... +! LZWORK (input) INTEGER +! The minimal length of the workspace vector ZWORK. +! LZWORK is calculated as follows: +! Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) +! MLWDMD = minimal workspace for ZGEDMD (see the +! description of LWORK in ZGEDMD) +! MLWMQR = N (minimal workspace for +! ZUNMQR['L','N',M,N,N]) +! MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) +! MINMN = MIN(M,N) +! Then +! LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) +! is further updated as follows: +! if JOBZ == 'V' or JOBZ == 'F' THEN +! LZWORK = MAX(LZWORK, MINMN+MLWMQR) +! if JOBQ == 'Q' THEN +! LZWORK = MAX(ZLWORK, MINMN+MLWGQR) +! +!..... +! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +! On exit, +! WORK(1:N-1) contains the singular values of +! the input submatrix F(1:M,1:N-1). +! If the call to ZGEDMDQ is only workspace query, then +! WORK(1) contains the minimal workspace length and +! WORK(2) is the optimal workspace length. hence, the +! length of work is at least 2. +! See the description of LWORK. +!..... +! LWORK (input) INTEGER +! The minimal length of the workspace vector WORK. +! LWORK is the same as in ZGEDMD, because in ZGEDMDQ +! only ZGEDMD requires real workspace for snapshots +! of dimensions MIN(M,N)-by-(N-1). +! If on entry LWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace length for WORK. +!..... +! IWORK (workspace/output) INTEGER LIWORK-by-1 array +! Workspace that is required only if WHTSVD equals +! 2 , 3 or 4. (See the description of WHTSVD). +! If on entry LWORK =-1 or LIWORK=-1, then the +! minimal length of IWORK is computed and returned in +! IWORK(1). See the description of LIWORK. +!..... +! LIWORK (input) INTEGER +! The minimal length of the workspace vector IWORK. +! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +! Let M1=MIN(M,N), N1=N-1. Then +! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) +! If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) +! If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) +! If on entry LIWORK = -1, then a workspace query is +! assumed and the procedure only computes the minimal +! and the optimal workspace lengths for both WORK and +! IWORK. See the descriptions of WORK and IWORK. +!..... +! INFO (output) INTEGER +! -i < 0 :: On entry, the i-th argument had an +! illegal value +! = 0 :: Successful return. +! = 1 :: Void input. Quick exit (M=0 or N=0). +! = 2 :: The SVD computation of X did not converge. +! Suggestion: Check the input data and/or +! repeat with different WHTSVD. +! = 3 :: The computation of the eigenvalues did not +! converge. +! = 4 :: If data scaling was requested on input and +! the procedure found inconsistency in the data +! such that for some column index i, +! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +! to zero if JOBS=='C'. The computation proceeds +! with original or modified data and warning +! flag is set with INFO=4. +!............................................................. +!............................................................. +! Parameters +! ~~~~~~~~~~ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +! COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +! +! Local scalars +! ~~~~~~~~~~~~~ + INTEGER :: IMINWR, INFO1, MINMN, MLRWRK, & + MLWDMD, MLWGQR, MLWMQR, MLWORK, & + MLWQR, OLWDMD, OLWGQR, OLWMQR, & + OLWORK, OLWQR + LOGICAL :: LQUERY, SCCOLX, SCCOLY, WANTQ, & + WNTTRF, WNTRES, WNTVEC, WNTVCF, & + WNTVCQ, WNTREF, WNTEX + CHARACTER(LEN=1) :: JOBVL +! +! External functions (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~ + LOGICAL LSAME + EXTERNAL LSAME +! +! External subroutines (BLAS and LAPACK) +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEQRF, ZLACPY, ZLASET, ZUNGQR, & + ZUNMQR, XERBLA + +! External subroutines +! ~~~~~~~~~~~~~~~~~~~~ + EXTERNAL ZGEDMD + +! Intrinsic functions +! ~~~~~~~~~~~~~~~~~~~ + INTRINSIC MAX, MIN, INT + !.......................................................... + ! + ! Test the input arguments + WNTRES = LSAME(JOBR,'R') + SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' ) + SCCOLY = LSAME(JOBS,'Y') + WNTVEC = LSAME(JOBZ,'V') + WNTVCF = LSAME(JOBZ,'F') + WNTVCQ = LSAME(JOBZ,'Q') + WNTREF = LSAME(JOBF,'R') + WNTEX = LSAME(JOBF,'E') + WANTQ = LSAME(JOBQ,'Q') + WNTTRF = LSAME(JOBT,'R') + MINMN = MIN(M,N) + INFO = 0 + LQUERY = ( (LZWORK == -1) .OR. (LWORK == -1) .OR. (LIWORK == -1) ) +! + IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. & + LSAME(JOBS,'N')) ) THEN + INFO = -1 + ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ & + .OR. LSAME(JOBZ,'N')) ) THEN + INFO = -2 + ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR. & + ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN + INFO = -3 + ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN + INFO = -4 + ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) ) THEN + INFO = -5 + ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR. & + LSAME(JOBF,'N') ) ) THEN + INFO = -6 + ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR. & + (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN + INFO = -7 + ELSE IF ( M < 0 ) THEN + INFO = -8 + ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN + INFO = -9 + ELSE IF ( LDF < M ) THEN + INFO = -11 + ELSE IF ( LDX < MINMN ) THEN + INFO = -13 + ELSE IF ( LDY < MINMN ) THEN + INFO = -15 + ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. & + ((NRNK >= 1).AND.(NRNK <=N ))) ) THEN + INFO = -16 + ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN + INFO = -17 + ELSE IF ( LDZ < M ) THEN + INFO = -21 + ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN + INFO = -24 + ELSE IF ( LDV < N-1 ) THEN + INFO = -26 + ELSE IF ( LDS < N-1 ) THEN + INFO = -28 + END IF +! + IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN + JOBVL = 'V' + ELSE + JOBVL = 'N' + END IF + IF ( INFO == 0 ) THEN + ! Compute the minimal and the optimal workspace + ! requirements. Simulate running the code and + ! determine minimal and optimal sizes of the + ! workspace at any moment of the run. + IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN + ! All output except K is void. INFO=1 signals + ! the void input. In case of a workspace query, + ! the minimal workspace lengths are returned. + IF ( LQUERY ) THEN + IWORK(1) = 1 + ZWORK(1) = 2 + ZWORK(2) = 2 + WORK(1) = 2 + WORK(2) = 2 + ELSE + K = 0 + END IF + INFO = 1 + RETURN + END IF + + MLRWRK = 2 + MLWORK = 2 + OLWORK = 2 + IMINWR = 1 + MLWQR = MAX(1,N) ! Minimal workspace length for ZGEQRF. + MLWORK = MAX(MLWORK,MINMN + MLWQR) + + IF ( LQUERY ) THEN + CALL ZGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, & + INFO1 ) + OLWQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN + OLWQR) + END IF + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK, -1, WORK, -1, IWORK,& + -1, INFO1 ) + MLWDMD = INT(ZWORK(1)) + MLWORK = MAX(MLWORK, MINMN + MLWDMD) + MLRWRK = MAX(MLRWRK, INT(WORK(1))) + IMINWR = MAX(IMINWR, IWORK(1)) + IF ( LQUERY ) THEN + OLWDMD = INT(ZWORK(2)) + OLWORK = MAX(OLWORK, MINMN+OLWDMD) + END IF + IF ( WNTVEC .OR. WNTVCF ) THEN + MLWMQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWMQR) + IF ( LQUERY ) THEN + CALL ZUNMQR( 'L','N', M, N, MINMN, F, LDF, & + ZWORK, Z, LDZ, ZWORK, -1, INFO1 ) + OLWMQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWMQR) + END IF + END IF + IF ( WANTQ ) THEN + MLWGQR = MAX(1,N) + MLWORK = MAX(MLWORK,MINMN+MLWGQR) + IF ( LQUERY ) THEN + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK, -1, INFO1 ) + OLWGQR = INT(ZWORK(1)) + OLWORK = MAX(OLWORK,MINMN+OLWGQR) + END IF + END IF + IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34 + IF ( LWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32 + IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30 + END IF + IF( INFO /= 0 ) THEN + CALL XERBLA( 'ZGEDMDQ', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN +! Return minimal and optimal workspace sizes + IWORK(1) = IMINWR + ZWORK(1) = MLWORK + ZWORK(2) = OLWORK + WORK(1) = MLRWRK + WORK(2) = MLRWRK + RETURN + END IF +!..... +! Initial QR factorization that is used to represent the +! snapshots as elements of lower dimensional subspace. +! For large scale computation with M >> N, at this place +! one can use an out of core QRF. +! + CALL ZGEQRF( M, N, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) +! +! Define X and Y as the snapshots representations in the +! orthogonal basis computed in the QR factorization. +! X corresponds to the leading N-1 and Y to the trailing +! N-1 snapshots. + CALL ZLASET( 'L', MINMN, N-1, ZZERO, ZZERO, X, LDX ) + CALL ZLACPY( 'U', MINMN, N-1, F, LDF, X, LDX ) + CALL ZLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY ) + IF ( M >= 3 ) THEN + CALL ZLASET( 'L', MINMN-2, N-2, ZZERO, ZZERO, & + Y(3,1), LDY ) + END IF +! +! Compute the DMD of the projected snapshot pairs (X,Y) + CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, & + N-1, X, LDX, Y, LDY, NRNK, TOL, K, & + EIGS, Z, LDZ, RES, B, LDB, V, LDV, & + S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & + WORK, LWORK, IWORK, LIWORK, INFO1 ) + IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN + ! Return with error code. See ZGEDMD for details. + INFO = INFO1 + RETURN + ELSE + INFO = INFO1 + END IF +! +! The Ritz vectors (Koopman modes) can be explicitly +! formed or returned in factored form. + IF ( WNTVEC ) THEN + ! Compute the eigenvectors explicitly. + IF ( M > MINMN ) CALL ZLASET( 'A', M-MINMN, K, ZZERO, & + ZZERO, Z(MINMN+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + ELSE IF ( WNTVCF ) THEN + ! Return the Ritz vectors (eigenvectors) in factored + ! form Z*V, where Z contains orthonormal matrix (the + ! product of Q from the initial QR factorization and + ! the SVD/POD_basis returned by ZGEDMD in X) and the + ! second factor (the eigenvectors of the Rayleigh + ! quotient) is in the array V, as returned by ZGEDMD. + CALL ZLACPY( 'A', N, K, X, LDX, Z, LDZ ) + IF ( M > N ) CALL ZLASET( 'A', M-N, K, ZZERO, ZZERO, & + Z(N+1,1), LDZ ) + CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, & + LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! +! Some optional output variables: +! +! The upper triangular factor R in the initial QR +! factorization is optionally returned in the array Y. +! This is useful if this call to ZGEDMDQ is to be +! followed by a streaming DMD that is implemented in a +! QR compressed form. + IF ( WNTTRF ) THEN ! Return the upper triangular R in Y + CALL ZLASET( 'A', MINMN, N, ZZERO, ZZERO, Y, LDY ) + CALL ZLACPY( 'U', MINMN, N, F, LDF, Y, LDY ) + END IF +! +! The orthonormal/unitary factor Q in the initial QR +! factorization is optionally returned in the array F. +! Same as with the triangular factor above, this is +! useful in a streaming DMD. + IF ( WANTQ ) THEN ! Q overwrites F + CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, & + ZWORK(MINMN+1), LZWORK-MINMN, INFO1 ) + END IF +! + RETURN +! + END SUBROUTINE ZGEDMDQ + \ No newline at end of file From c0865ab0fe8579a2716b4f12fae4e3f2f6b1bede Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 10:45:29 +0200 Subject: [PATCH 134/718] Add LAPACKE interfaces for Dynamic Mode Decomposition (Reference-LAPACK PR 736) --- lapack-netlib/LAPACKE/src/lapacke_cgedmd.c | 115 ++++++++++ .../LAPACKE/src/lapacke_cgedmd_work.c | 180 +++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c | 123 +++++++++++ .../LAPACKE/src/lapacke_cgedmdq_work.c | 205 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dgedmd.c | 112 ++++++++++ .../LAPACKE/src/lapacke_dgedmd_work.c | 179 +++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c | 119 ++++++++++ .../LAPACKE/src/lapacke_dgedmdq_work.c | 200 +++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_sgedmd.c | 112 ++++++++++ .../LAPACKE/src/lapacke_sgedmd_work.c | 179 +++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c | 119 ++++++++++ .../LAPACKE/src/lapacke_sgedmdq_work.c | 200 +++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zgedmd.c | 116 ++++++++++ .../LAPACKE/src/lapacke_zgedmd_work.c | 182 ++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c | 123 +++++++++++ .../LAPACKE/src/lapacke_zgedmdq_work.c | 205 ++++++++++++++++++ 16 files changed, 2469 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmd.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmd.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmd.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmd.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c new file mode 100644 index 000000000..a269b0daf --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c @@ -0,0 +1,115 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, lapack_int k, + lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int ldz, + lapack_complex_float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_complex_float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -20; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c new file mode 100644 index 000000000..534934efb --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c @@ -0,0 +1,180 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, lapack_int k, + lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int ldz, + lapack_complex_float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* w, + lapack_int ldw, lapack_complex_float* s, lapack_int lds, + lapack_complex_float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* w_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, + &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, + w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c new file mode 100644 index 000000000..60e83729b --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c @@ -0,0 +1,123 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_float* f, + lapack_int ldf, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float tol, + lapack_int k, lapack_complex_float* reig, + lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int ldz, + lapack_complex_float* res, lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_complex_float* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c new file mode 100644 index 000000000..5bdbd3f56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c @@ -0,0 +1,205 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* f, lapack_int ldf, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float tol, lapack_int k, + lapack_complex_float* reig, + lapack_complex_float* imeig, + lapack_complex_float* z, + lapack_int ldz, lapack_complex_float* res, + lapack_complex_float* b, + lapack_int ldb, lapack_complex_float* v, + lapack_int ldv, lapack_complex_float* s, + lapack_int lds, lapack_complex_float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_float* f_t = NULL; + lapack_complex_float* x_t = NULL; + lapack_complex_float* y_t = NULL; + lapack_complex_float* z_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* v_t = NULL; + lapack_complex_float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c new file mode 100644 index 000000000..246d7f649 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c @@ -0,0 +1,112 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* x, lapack_int ldx, double* y, lapack_int ldy, + lapack_int k, double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, lapack_int ldb, + double* w, lapack_int ldw, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c new file mode 100644 index 000000000..4d1169de9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c @@ -0,0 +1,179 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, + double* y, lapack_int ldy, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* w, lapack_int ldw, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* w_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (double*)LAPACKE_malloc( sizeof(double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, + &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, + w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c new file mode 100644 index 000000000..f3d621ba9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* f, lapack_int ldf, + double* x, lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, lapack_int ldb, + double* v, lapack_int ldv, double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + double* work = NULL; + lapack_int* iwork = NULL; + double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c new file mode 100644 index 000000000..51b2a66d8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* f, lapack_int ldf, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* v, lapack_int ldv, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + double* f_t = NULL; + double* x_t = NULL; + double* y_t = NULL; + double* z_t = NULL; + double* b_t = NULL; + double* v_t = NULL; + double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (double*)LAPACKE_malloc( sizeof(double) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (double*)LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c new file mode 100644 index 000000000..879631b1d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c @@ -0,0 +1,112 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* x, lapack_int ldx, float* y, lapack_int ldy, + lapack_int k, float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, lapack_int ldb, + float* w, lapack_int ldw, float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c new file mode 100644 index 000000000..762a9b271 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c @@ -0,0 +1,179 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, + float* y, lapack_int ldy, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* w_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (float*)LAPACKE_malloc( sizeof(float) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, + &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, + w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c new file mode 100644 index 000000000..e202d7fbd --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c @@ -0,0 +1,119 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* f, lapack_int ldf, + float* x, lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, lapack_int ldb, + float* v, lapack_int ldv, float* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + float* work = NULL; + lapack_int* iwork = NULL; + float work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int) work_query; + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c new file mode 100644 index 000000000..9039898d2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c @@ -0,0 +1,200 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* f, lapack_int ldf, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* v, lapack_int ldv, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + float* f_t = NULL; + float* x_t = NULL; + float* y_t = NULL; + float* z_t = NULL; + float* b_t = NULL; + float* v_t = NULL; + float* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (float*)LAPACKE_malloc( sizeof(float) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c new file mode 100644 index 000000000..f3f421c54 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c @@ -0,0 +1,116 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int k, lapack_complex_double* reig, + lapack_complex_double* imeig, lapack_complex_double* z, + lapack_int ldz, lapack_complex_double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_complex_double* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -8; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -10; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -18; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -20; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, w, ldw ) ) { + return -22; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, + x, ldx, y, ldy, k, reig, imeig, z, ldz, res, + b, ldb, w, ldw, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c new file mode 100644 index 000000000..2554411ec --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c @@ -0,0 +1,182 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmd +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int k, + lapack_complex_double* reig, + lapack_complex_double* imeig, lapack_complex_double* z, + lapack_int ldz, lapack_complex_double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldw_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* w_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldx < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldy < n ) { + info = -11; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldz < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldb < n ) { + info = -19; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( ldw < n ) { + info = -21; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + if( lds < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, + &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + w_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldw_t * MAX(1,n) ); + if( w_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, + &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, + w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_5: + LAPACKE_free( w_t ); +exit_level_4: + LAPACKE_free( b_t ); +exit_level_3: + LAPACKE_free( z_t ); +exit_level_2: + LAPACKE_free( y_t ); +exit_level_1: + LAPACKE_free( x_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmd_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c new file mode 100644 index 000000000..3648ffdf2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c @@ -0,0 +1,123 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, + char jobq, char jobt, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, lapack_complex_double* f, + lapack_int ldf, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double tol, + lapack_int k, lapack_complex_double* reig, + lapack_complex_double* imeig, + lapack_complex_double* z, lapack_int ldz, + lapack_complex_double* res, lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, lapack_int lds) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_int liwork = -1; + lapack_complex_double* work = NULL; + lapack_int* iwork = NULL; + lapack_complex_double work_query; + lapack_int iwork_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, f, ldf ) ) { + return -11; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) { + return -13; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) { + return -15; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) { + return -22; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) { + return -25; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, v, ldv ) ) { + return -27; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) { + return -29; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, &work_query, lwork, + &iwork_query, liwork ); + + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + liwork = iwork_query; + /* Allocate memory for work arrays */ + work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, + jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, + nrnk, tol, k, reig, imeig, z, ldz, res, + b, ldb, v, ldv, s, lds, work, lwork, iwork, + liwork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c new file mode 100644 index 000000000..9afceba07 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c @@ -0,0 +1,205 @@ +/***************************************************************************** + Copyright (c) 2014, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgedmdq +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* f, lapack_int ldf, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double tol, lapack_int k, + lapack_complex_double* reig, + lapack_complex_double* imeig, + lapack_complex_double* z, + lapack_int ldz, lapack_complex_double* res, + lapack_complex_double* b, + lapack_int ldb, lapack_complex_double* v, + lapack_int ldv, lapack_complex_double* s, + lapack_int lds, lapack_complex_double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int ldf_t = MAX(1,m); + lapack_int ldx_t = MAX(1,m); + lapack_int ldy_t = MAX(1,m); + lapack_int ldz_t = MAX(1,m); + lapack_int ldb_t = MAX(1,m); + lapack_int ldv_t = MAX(1,m); + lapack_int lds_t = MAX(1,m); + lapack_complex_double* f_t = NULL; + lapack_complex_double* x_t = NULL; + lapack_complex_double* y_t = NULL; + lapack_complex_double* z_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* v_t = NULL; + lapack_complex_double* s_t = NULL; + /* Check leading dimension(s) */ + if( ldf < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldx < n ) { + info = -14; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldy < n ) { + info = -16; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldz < n ) { + info = -23; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldb < n ) { + info = -26; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( ldv < n ) { + info = -28; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + if( lds < n ) { + info = -30; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 || liwork == -1 ) { + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + f_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldf_t * MAX(1,n) ); + if( f_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) ); + if( x_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) ); + if( y_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) ); + if( z_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_3; + } + b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_4; + } + v_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) ); + if( v_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_5; + } + s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) ); + if( s_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_6; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t ); + LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t ); + LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t ); + LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t ); + LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, + imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + work, &lwork, iwork, &liwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds ); + /* Release memory and exit */ + LAPACKE_free( s_t ); +exit_level_6: + LAPACKE_free( v_t ); +exit_level_5: + LAPACKE_free( b_t ); +exit_level_4: + LAPACKE_free( z_t ); +exit_level_3: + LAPACKE_free( y_t ); +exit_level_2: + LAPACKE_free( x_t ); +exit_level_1: + LAPACKE_free( f_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info ); + } + return info; +} From be5e1ecee3e517ac34dc12c74b829ac5543430a9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 11:27:11 +0200 Subject: [PATCH 135/718] Add LAPACKE interfaces for ?TRSYL3 (Reference-LAPACK PR 651) --- lapack-netlib/LAPACKE/src/Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index d35e2a2d7..9d81e2416 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -28,7 +28,6 @@ ############################################################################## # Contents: Native C interface to LAPACK # Author: Intel Corporation -# September, 2010 ############################################################################## # makefile for LAPACKE, used to build lapacke binary. # @@ -608,6 +607,8 @@ lapacke_ctrsna.o \ lapacke_ctrsna_work.o \ lapacke_ctrsyl.o \ lapacke_ctrsyl_work.o \ +lapacke_ctrsyl3.o \ +lapacke_ctrsyl3_work.o \ lapacke_ctrtri.o \ lapacke_ctrtri_work.o \ lapacke_ctrtrs.o \ @@ -1228,6 +1229,8 @@ lapacke_dtrsna.o \ lapacke_dtrsna_work.o \ lapacke_dtrsyl.o \ lapacke_dtrsyl_work.o \ +lapacke_dtrsyl3.o \ +lapacke_dtrsyl3_work.o \ lapacke_dtrtri.o \ lapacke_dtrtri_work.o \ lapacke_dtrtrs.o \ @@ -1800,6 +1803,8 @@ lapacke_strsna.o \ lapacke_strsna_work.o \ lapacke_strsyl.o \ lapacke_strsyl_work.o \ +lapacke_strsyl3.o \ +lapacke_strsyl3_work.o \ lapacke_strtri.o \ lapacke_strtri_work.o \ lapacke_strtrs.o \ @@ -2378,6 +2383,8 @@ lapacke_ztrsna.o \ lapacke_ztrsna_work.o \ lapacke_ztrsyl.o \ lapacke_ztrsyl_work.o \ +lapacke_ztrsyl3.o \ +lapacke_ztrsyl3_work.o \ lapacke_ztrtri.o \ lapacke_ztrtri_work.o \ lapacke_ztrtrs.o \ @@ -2552,7 +2559,7 @@ $(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTEND ifdef BUILD_DEPRECATED $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif -ifdef (USEXBLAS) +ifdef USEXBLAS $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG From b6a28adeaf73a3206848edbdc646d139e6aa8b63 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 11:33:03 +0200 Subject: [PATCH 136/718] Add ?TRSYL3 (Reference-LAPACK PR 651) --- cmake/lapacke.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index c8cfa7778..6713beefc 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -558,6 +558,8 @@ set(CSRC lapacke_ctrsna_work.c lapacke_ctrsyl.c lapacke_ctrsyl_work.c + lapacke_ctrsyl3.c + lapacke_ctrsyl3_work.c lapacke_ctrtri.c lapacke_ctrtri_work.c lapacke_ctrtrs.c @@ -1178,6 +1180,8 @@ set(DSRC lapacke_dtrsna_work.c lapacke_dtrsyl.c lapacke_dtrsyl_work.c + lapacke_dtrsyl3.c + lapacke_dtrsyl3_work.c lapacke_dtrtri.c lapacke_dtrtri_work.c lapacke_dtrtrs.c @@ -1750,6 +1754,8 @@ set(SSRC lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c + lapacke_ctrsyl3.c + lapacke_ctrsyl3_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c @@ -2325,6 +2331,8 @@ set(ZSRC lapacke_ztrsna_work.c lapacke_ztrsyl.c lapacke_ztrsyl_work.c + lapacke_ztrsyl3.c + lapacke_ztrsyl3_work.c lapacke_ztrtri.c lapacke_ztrtri_work.c lapacke_ztrtrs.c From e28fdf71b78613073701b3fc2f05f60ad46bb6ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 14:50:54 +0200 Subject: [PATCH 137/718] Add LAPACKE interfaces for Dynamic Mode Decomposition (Reference-LAPACK PR 736) --- lapack-netlib/LAPACKE/src/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 9d81e2416..969288f42 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -137,6 +137,10 @@ lapacke_cgerqf.o \ lapacke_cgerqf_work.o \ lapacke_cgesdd.o \ lapacke_cgesdd_work.o \ +lapacke_cgedmd.o \ +lapacke_cgedmd_work.o \ +lapacke_cgedmdq.o \ +lapacke_cgedmdq_work.o \ lapacke_cgesv.o \ lapacke_cgesv_work.o \ lapacke_cgesvd.o \ @@ -763,6 +767,10 @@ lapacke_dgerqf.o \ lapacke_dgerqf_work.o \ lapacke_dgesdd.o \ lapacke_dgesdd_work.o \ +lapacke_dgedmd.o \ +lapacke_dgedmd_work.o \ +lapacke_dgedmdq.o \ +lapacke_dgedmdq_work.o \ lapacke_dgesv.o \ lapacke_dgesv_work.o \ lapacke_dgesvd.o \ @@ -1343,6 +1351,10 @@ lapacke_sgerqf.o \ lapacke_sgerqf_work.o \ lapacke_sgesdd.o \ lapacke_sgesdd_work.o \ +lapacke_sgedmd.o \ +lapacke_sgedmd_work.o \ +lapacke_sgedmdq.o \ +lapacke_sgedmdq_work.o \ lapacke_sgesv.o \ lapacke_sgesv_work.o \ lapacke_sgesvd.o \ @@ -1913,6 +1925,10 @@ lapacke_zgerqf.o \ lapacke_zgerqf_work.o \ lapacke_zgesdd.o \ lapacke_zgesdd_work.o \ +lapacke_zgedmd.o \ +lapacke_zgedmd_work.o \ +lapacke_zgedmdq.o \ +lapacke_zgedmdq_work.o \ lapacke_zgesv.o \ lapacke_zgesv_work.o \ lapacke_zgesvd.o \ From 83d6ce12893e182bfb212ea5f6d7f3d2f489cda3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 14:55:26 +0200 Subject: [PATCH 138/718] Add interfaces for Dynamic Mode Decomposition (Reference-LAPACK PR 736) --- cmake/lapacke.cmake | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 6713beefc..f43bf10d0 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -90,6 +90,10 @@ set(CSRC lapacke_cgerqf_work.c lapacke_cgesdd.c lapacke_cgesdd_work.c + lapacke_cgedmd.c + lapacke_cgedmd_work.c + lapacke_cgedmdq.c + lapacke_cgedmdq_work.c lapacke_cgesv.c lapacke_cgesv_work.c lapacke_cgesvd.c @@ -713,6 +717,10 @@ set(DSRC lapacke_dgerqf_work.c lapacke_dgesdd.c lapacke_dgesdd_work.c + lapacke_dgedmd.c + lapacke_dgedmd_work.c + lapacke_dgedmdq.c + lapacke_dgedmdq_work.c lapacke_dgesv.c lapacke_dgesv_work.c lapacke_dgesvd.c @@ -1291,6 +1299,10 @@ set(SSRC lapacke_sgerqf_work.c lapacke_sgesdd.c lapacke_sgesdd_work.c + lapacke_sgedmd.c + lapacke_sgedmd_work.c + lapacke_sgedmdq.c + lapacke_sgedmdq_work.c lapacke_sgesv.c lapacke_sgesv_work.c lapacke_sgesvd.c @@ -1863,6 +1875,10 @@ set(ZSRC lapacke_zgerqf_work.c lapacke_zgesdd.c lapacke_zgesdd_work.c + lapacke_zgedmd.c + lapacke_zgedmd_work.c + lapacke_zgedmdq.c + lapacke_zgedmdq_work.c lapacke_zgesv.c lapacke_zgesv_work.c lapacke_zgesvd.c From defafd1353a13e20452f94a3320593ecf6ffe0b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 15:07:53 +0200 Subject: [PATCH 139/718] Add functions for Dynamic Mode Decomposition (Reference-LAPACK PR 736) --- cmake/lapack.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 544e226ab..077390d90 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -124,7 +124,7 @@ set(SLASRC ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slatrs3.f strsyl3.f sgelst.f) + slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -223,7 +223,7 @@ set(CLASRC chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f cgelst.f) + clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -316,7 +316,7 @@ set(DLASRC dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlatrs3.f dtrsyl3.f dgelst.f) + dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f zgelst.f) + zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From 9f0ef475b4fe9e287cd253aa9c285155887ee99c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 15:12:02 +0200 Subject: [PATCH 140/718] Add Dynamic Mode Decomposition functions (Reference-LAPACK PR 736) --- lapack-netlib/SRC/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 8cac42330..74db14e46 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slatrs3.o strsyl3.o sgelst.o + sgesvdq.o slatrs3.o strsyl3.o sgelst.o sgedmd.o sgedmdq.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o cgedmd.o cgedmdq.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o + dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o dgedmd.o dgedmdq.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o zgedmd.o zgedmdq.o endif ifdef USEXBLAS From c7a05458cd6d315ac3cc377c3760f59784d9e9e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 15:52:44 +0200 Subject: [PATCH 141/718] Add Dynamic Mode Decomposition functions (Reference-LAPACK PR 736) --- lapack-netlib/LAPACKE/include/lapack.h | 132 +++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index a0fcaa259..a5d02b42e 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3323,6 +3323,138 @@ void LAPACK_zgesdd_base( #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__) #endif +#define LAPACK_cgedmd LAPACK_GLOBAL(cgedmd,CGEDMD) +void LAPACK_cgedmd( + char const* jobs, char const* jobz, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + lapack_complex_float* x, lapack_int const* ldx, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* k, + lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* b, lapack_int const* ldb, + lapack_complex_float* w, lapack_int const* ldw, + lapack_complex_float* s, lapack_int const* lds, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dgedmd LAPACK_GLOBAL(dgedmd,DGEDMD) +void LAPACK_dgedmd( + char const* jobs, char const* jobz, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + double* x, lapack_int const* ldx, + double* y, lapack_int const* ldy, lapack_int const* k, + double* reig, double* imeig, + double* z, lapack_int const* ldz, double* res, + double* b, lapack_int const* ldb, + double* w, lapack_int const* ldw, + double* s, lapack_int const* lds, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sgedmd LAPACK_GLOBAL(sgedmd,SGEDMD) +void LAPACK_sgedmd( + char const* jobs, char const* jobz, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + float* x, lapack_int const* ldx, + float* y, lapack_int const* ldy, lapack_int const* k, + float* reig, float* imeig, + float* z, lapack_int const* ldz, float* res, + float* b, lapack_int const* ldb, + float* w, lapack_int const* ldw, + float* s, lapack_int const* lds, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zgedmd LAPACK_GLOBAL(zgedmd,ZGEDMD) +void LAPACK_zgedmd( + char const* jobs, char const* jobz, char const* jobf, + lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, + lapack_complex_double* x, lapack_int const* ldx, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* k, + lapack_complex_double* reig, lapack_complex_double* imeig, + lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* b, lapack_int const* ldb, + lapack_complex_double* w, lapack_int const* ldw, + lapack_complex_double* s, lapack_int const* lds, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_cgedmdq LAPACK_GLOBAL(cgedmdq,CGEDMDQ) +void LAPACK_cgedmdq( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + lapack_complex_float* f, lapack_int const* ldf, + lapack_complex_float* x, lapack_int const* ldx, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, + float const* tol, lapack_int const* k, + lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* b, lapack_int const* ldb, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* s, lapack_int const* lds, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_dgedmdq LAPACK_GLOBAL(dgedmdq,DGEDMDQ) +void LAPACK_dgedmdq( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + double* f, lapack_int const* ldf, + double* x, lapack_int const* ldx, + double* y, lapack_int const* ldy, lapack_int const* nrnk, + double const* tol, lapack_int const* k, + double* reig, double* imeig, + double* z, lapack_int const* ldz, double* res, + double* b, lapack_int const* ldb, + double* v, lapack_int const* ldv, + double* s, lapack_int const* lds, + double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_sgedmdq LAPACK_GLOBAL(sgedmdq,SGEDMDQ) +void LAPACK_sgedmdq( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + float* f, lapack_int const* ldf, + float* x, lapack_int const* ldx, + float* y, lapack_int const* ldy, lapack_int const* nrnk, + float const* tol, lapack_int const* k, + float* reig, float* imeig, + float* z, lapack_int const* ldz, float* res, + float* b, lapack_int const* ldb, + float* v, lapack_int const* ldv, + float* s, lapack_int const* lds, + float* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ); + +#define LAPACK_zgedmdq LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) +void LAPACK_zgedmdq( + char const* jobs, char const* jobz, char const* jobr, char const* jobq, + char const* jobt, char const* jobf, lapack_int const* whtsvd, + lapack_int const* m, lapack_int const* n, + lapack_complex_double* f, lapack_int const* ldf, + lapack_complex_double* x, lapack_int const* ldx, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, + double const* tol, lapack_int const* k, + lapack_complex_double* reig, lapack_complex_double* imeig, + lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* b, lapack_int const* ldb, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* s, lapack_int const* lds, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* iwork, lapack_int const* liwork, + lapack_int* info ) + #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) lapack_int LAPACK_cgesv( lapack_int const* n, lapack_int const* nrhs, From de88063aa2795fd25c00efeb92791b9c1d38c5be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 15:56:10 +0200 Subject: [PATCH 142/718] Add interfaces for Dynamic Mode Decomposition functions (Reference-LAPACK PR 736) --- lapack-netlib/LAPACKE/include/lapacke.h | 131 ++++++++++++++++++++++-- 1 file changed, 122 insertions(+), 9 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 9bd228064..9a9ab4753 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -956,7 +956,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr, lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv, lapack_int* numrank ); - + lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, @@ -5712,6 +5712,120 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_complex_double* work, lapack_int lwork, double* rwork, lapack_int* iwork ); +lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, + float* y, lapack_int ldy, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, + double* y, lapack_int ldy, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* w, lapack_int ldw, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int k, + lapack_complex_float* reig, + lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int ldz, + lapack_complex_float* res, + lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* w, lapack_int ldw, + lapack_complex_float* s, lapack_int lds, + lapack_complex_float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int k, + lapack_complex_double* reig, + lapack_complex_double* imeig, + lapack_complex_double* z, lapack_int ldz, + lapack_complex_double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* w, lapack_int ldw, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); + +lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + float* f, lapack_int ldf, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float tol, lapack_int k, + float* reig, float* imeig, float* z, + lapack_int ldz, float* res, float* b, + lapack_int ldb, float* v, lapack_int ldv, + float* s, lapack_int lds, float* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + double* f, lapack_int ldf, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double tol, lapack_int k, + double* reig, double* imeig, double* z, + lapack_int ldz, double* res, double* b, + lapack_int ldb, double* v, lapack_int ldv, + double* s, lapack_int lds, double* work, + lapack_int lwork, lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_float* f, lapack_int ldf, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float tol, lapack_int k, + lapack_complex_float* reig, + lapack_complex_float* imeig, + lapack_complex_float* z, lapack_int ldz, + lapack_complex_float* res, + lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* v, lapack_int ldv, + lapack_complex_float* s, lapack_int lds, + lapack_complex_float* work, lapack_int lwork, + lapack_int* iwork, + lapack_int liwork ); + +lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, + char jobr, char jobq, char jobt, char jobf, + lapack_int whtsvd, lapack_int m, lapack_int n, + lapack_complex_double* f, lapack_int ldf, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double tol, lapack_int k, + lapack_complex_double* reig, + lapack_complex_double* imeig, + lapack_complex_double* z, lapack_int ldz, + lapack_complex_double* res, + lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* v, lapack_int ldv, + lapack_complex_double* s, lapack_int lds, + lapack_complex_double* work, lapack_int lwork, + lapack_int* iwork, + lapack_int liwork ); + lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv, float* b, lapack_int ldb ); @@ -5833,7 +5947,7 @@ lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp, lapack_int* iwork, lapack_int liwork, lapack_complex_double* cwork, lapack_int lcwork, double* rwork, lapack_int lrwork); - + lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, lapack_int mv, @@ -12550,7 +12664,7 @@ lapack_int LAPACKE_zhegv_2stage_work( int matrix_layout, lapack_int itype, char //LAPACK 3.8.0 lapack_int LAPACKE_ssysv_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int ldb ); lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, @@ -12560,7 +12674,7 @@ lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_in lapack_int LAPACKE_dsysv_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, double* tb, lapack_int ltb, - lapack_int* ipiv, lapack_int* ipiv2, + lapack_int* ipiv, lapack_int* ipiv2, double* b, lapack_int ldb ); lapack_int LAPACKE_dsysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, @@ -12612,10 +12726,10 @@ lapack_int LAPACKE_zhesv_aa_2stage_work( int matrix_layout, char uplo, lapack_in lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work, lapack_int lwork ); - + lapack_int LAPACKE_ssytrf_aa_2stage( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2 ); lapack_int LAPACKE_ssytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int lda, @@ -12671,7 +12785,7 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int LAPACKE_ssytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, - float* tb, lapack_int ltb, lapack_int* ipiv, + float* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, @@ -12680,7 +12794,7 @@ lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int LAPACKE_dsytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, double* tb, lapack_int ltb, - lapack_int* ipiv, lapack_int* ipiv2, + lapack_int* ipiv, lapack_int* ipiv2, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, @@ -12727,7 +12841,6 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i lapack_int lda, lapack_complex_double* tb, lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2, lapack_complex_double* b, lapack_int ldb ); - //LAPACK 3.10.0 lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n, lapack_int nb, float* a, From 5eef0793bac9637b80f9ce02363d80abcda24840 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 18:56:33 +0200 Subject: [PATCH 143/718] Fix missing semicolon --- lapack-netlib/LAPACKE/include/lapack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index a5d02b42e..f510c8c80 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3453,7 +3453,7 @@ void LAPACK_zgedmdq( lapack_complex_double* s, lapack_int const* lds, lapack_complex_double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ) + lapack_int* info ); #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) lapack_int LAPACK_cgesv( From 649ab6481adec7e4bb2a411a8d2956458523733f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 20:03:44 +0200 Subject: [PATCH 144/718] Add dummy C files for ?GEMDQ? --- lapack-netlib/SRC/cgedmd.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/cgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgedmd.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgedmd.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgedmd.c | 511 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++ 8 files changed, 4088 insertions(+) create mode 100644 lapack-netlib/SRC/cgedmd.c create mode 100644 lapack-netlib/SRC/cgedmdq.c create mode 100644 lapack-netlib/SRC/dgedmd.c create mode 100644 lapack-netlib/SRC/dgedmdq.c create mode 100644 lapack-netlib/SRC/sgedmd.c create mode 100644 lapack-netlib/SRC/sgedmdq.c create mode 100644 lapack-netlib/SRC/zgedmd.c create mode 100644 lapack-netlib/SRC/zgedmdq.c diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c new file mode 100644 index 000000000..447b23014 --- /dev/null +++ b/lapack-netlib/SRC/cgedmd.c @@ -0,0 +1,511 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i Date: Tue, 20 Jun 2023 21:39:29 +0200 Subject: [PATCH 145/718] Add dummy C sources for ?GEDMD --- cmake/lapack.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 077390d90..12127531d 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -624,7 +624,7 @@ set(SLASRC ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slatrs3.c strsyl3.c sgelst.c) + slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -722,7 +722,7 @@ set(CLASRC chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c cgelst.c) + clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -814,7 +814,7 @@ set(DLASRC dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlatrs3.c dtrsyl3.c dgelst.c) + dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -925,7 +925,7 @@ set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c - zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) + zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c zgedmd.c zgedmdq.c) if(USE_XBLAS) From 174f4e65e320241c7b57136db95d367cac7583d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 20 Jun 2023 23:14:35 +0200 Subject: [PATCH 146/718] Add LAPACK/LAPACKE functions for Dynamic Mode Decomposition --- exports/gensymbol | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index b584167a4..704eab06f 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -844,6 +844,23 @@ lapackobjs2z="$lapackobjs2z zungtsqr_row " +#functions added for lapack-3.11 +lapackobjs2c="$lapackobjs2c + cgedmd + cgedmdq + " +lapackobjs2d="$lapackobjs2d + dgedmd + dgedmdq + " +lapackobjs2s="$lapackobjs2s + sgedmd + sgedmdq + " +lapackobjs2z="$lapackobjs2z + zgedmd + zgedmdq + " lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1013,6 +1030,10 @@ lapackeobjsc=" LAPACKE_cgebrd_work LAPACKE_cgecon LAPACKE_cgecon_work + LAPACKE_cgedmd + LAPACKE_cgedmd_work + LAPACKE_cgedmdq + LAPACKE_cgedmdq_work LAPACKE_cgeequ LAPACKE_cgeequ_work LAPACKE_cgeequb @@ -1672,6 +1693,10 @@ lapackeobjsd=" LAPACKE_dgebrd_work LAPACKE_dgecon LAPACKE_dgecon_work + LAPACKE_dgedmd + LAPACKE_dgedmd_work + LAPACKE_dgedmdq + LAPACKE_dgedmdq_work LAPACKE_dgeequ LAPACKE_dgeequ_work LAPACKE_dgeequb @@ -2285,6 +2310,10 @@ lapackeobjss=" LAPACKE_sgebrd_work LAPACKE_sgecon LAPACKE_sgecon_work + LAPACKE_sgedmd + LAPACKE_sgedmd_work + LAPACKE_sgedmdq + LAPACKE_sgedmdq_work LAPACKE_sgeequ LAPACKE_sgeequ_work LAPACKE_sgeequb @@ -2894,6 +2923,10 @@ lapackeobjsz=" LAPACKE_zgebrd_work LAPACKE_zgecon LAPACKE_zgecon_work + LAPACKE_zgedmd + LAPACKE_zgedmd_work + LAPACKE_zgedmdq + LAPACKE_zgedmdq_work LAPACKE_zgeequ LAPACKE_zgeequ_work LAPACKE_zgeequb From 5cc1f7a0bab55bd8c10c9fceaa88d67a7246dede Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 Jun 2023 15:43:20 +0200 Subject: [PATCH 147/718] Add functional C replacements for ?GEDMD? --- lapack-netlib/SRC/cgedmd.c | 1159 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/cgedmdq.c | 778 ++++++++++++++++++++++ lapack-netlib/SRC/dgedmd.c | 1242 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgedmdq.c | 789 ++++++++++++++++++++++ lapack-netlib/SRC/sgedmd.c | 1235 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgedmdq.c | 785 ++++++++++++++++++++++ lapack-netlib/SRC/zgedmd.c | 1165 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgedmdq.c | 782 ++++++++++++++++++++++ 8 files changed, 7935 insertions(+) diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c index 447b23014..570395c7b 100644 --- a/lapack-netlib/SRC/cgedmd.c +++ b/lapack-netlib/SRC/cgedmd.c @@ -509,3 +509,1162 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; +static integer c__1 = 1; +static integer c__0 = 0; + +/* Subroutine */ int cgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, + integer *whtsvd, integer *m, integer *n, complex *x, integer *ldx, + complex *y, integer *ldy, integer *nrnk, real *tol, integer *k, + complex *eigs, complex *z__, integer *ldz, real *res, complex *b, + integer *ldb, complex *w, integer *ldw, complex *s, integer *lds, + complex *zwork, integer *lzwork, real *rwork, integer *lrwork, + integer *iwork, integer *liwork, integer *info) +{ + /* System generated locals */ + integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, + b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, + i__4, i__5; + real r__1, r__2; + complex q__1, q__2; + + /* Local variables */ + complex zone; + real zero, ssum; + integer info1, info2; + real xscl1, xscl2; + integer i__, j; + real scale; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *), cgeev_(char *, + char *, integer *, complex *, integer *, complex *, complex *, + integer *, complex *, integer *, complex *, integer *, real *, + integer *); + extern logical lsame_(char *, char *); + logical badxy; + real small; + char jobzl[1]; + extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + integer *, complex *, integer *); + logical wntex; + complex zzero; + extern real scnrm2_(integer *, complex *, integer *); + extern /* Subroutine */ int cgesdd_(char *, integer *, integer *, complex + *, integer *, real *, complex *, integer *, complex *, integer *, + complex *, integer *, real *, integer *, integer *), + clascl_(char *, integer *, integer *, real *, real *, integer *, + integer *, complex *, integer *, integer *); + extern integer icamax_(integer *, complex *, integer *); + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *), cgesvd_(char *, char *, integer *, integer *, complex *, + integer *, real *, complex *, integer *, complex *, integer *, + complex *, integer *, real *, integer *), clacpy_( + char *, integer *, integer *, complex *, integer *, complex *, + integer *), xerbla_(char *, integer *); + char t_or_n__[1]; + extern /* Subroutine */ int cgejsv_(char *, char *, char *, char *, char * + , char *, integer *, integer *, complex *, integer *, real *, + complex *, integer *, complex *, integer *, complex *, integer *, + real *, integer *, integer *, integer *), classq_(integer *, complex *, integer *, + real *, real *); + logical sccolx, sccoly; + extern logical sisnan_(real *); + integer lwrsdd, mwrsdd, iminwr; + logical wntref, wntvec; + real rootsc; + integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk; + logical lquery, wntres; + char jsvopt[1]; + integer lwrsvj, mwrsvj; + real rdummy[2]; + extern /* Subroutine */ int mecago_(); + integer lwrsvq, mwrsvq; + real ofl, one; + extern /* Subroutine */ int cgesvdq_(char *, char *, char *, char *, char + *, integer *, integer *, complex *, integer *, real *, complex *, + integer *, complex *, integer *, integer *, integer *, integer *, + complex *, integer *, real *, integer *, integer *); + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real32 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ............................................................ */ +/* Purpose */ +/* ======= */ +/* CGEDMD computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, CGEDMD computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, CGEDMD returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* ...................................................................... */ +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product X(:,1:K)*W, where X */ +/* contains a POD basis (leading left singular vectors */ +/* of the data matrix X) and W contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of K, X, W, Z. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will be */ +/* computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: CGESVD (the QR SVD algorithm) */ +/* 2 :: CGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M>= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by CGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to CGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), */ +/* where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to CGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where */ +/* LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: CGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: CGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: CGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: CGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.f; + zwork[1].r = 2.f, zwork[1].i = 0.f; + zwork[2].r = 2.f, zwork[2].i = 0.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of CGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the CGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + cgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); +/* LAPACK CALL */ + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + olwork = f2cmax(2,olwork); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (real) mlrwrk; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O") * slamch_("P"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using CLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = SCNRM2( M, X(1,i), 1 ) */ + scale = zero; + classq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (c_abs(&y[icamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + csscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using CLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = SCNRM2( M, Y(1,i), 1 ) */ + scale = zero; + classq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("CGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPA */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + r__1 = one / rwork[i__]; + csscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + r__1 = -rwork[i__]; + r__2 = one / (real) (*m); + clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK */ + } else if (c_abs(&x[icamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], & + info1); +/* LA */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 2) { + cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, & + w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1] + , &info1); +/* LAP */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 3) { + i__1 = *lrwork - *n; + cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], + &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1); +/* LAPACK CA */ + clacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 4) { + i__1 = *lrwork - *n; + cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], + lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1); + clacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = rwork[*n + 1]; + xscl2 = rwork[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case CGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + clascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (rwork[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("CGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^H * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^H is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + r__1 = one / rwork[i__]; + csscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that CGESVD, CGESVDQ and CGESDD return the */ +/* adjoint matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + rwork[*n + i__] = one / rwork[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * w_dim1; + i__4 = *n + i__; + q__2.r = rwork[i__4], q__2.i = zero; + i__5 = i__ + j * w_dim1; + q__1.r = q__2.r * w[i__5].r - q__2.i * w[i__5].i, q__1.i = + q__2.r * w[i__5].i + q__2.i * w[i__5].r; + w[i__3].r = q__1.r, w[i__3].i = q__1.i; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside CGEDMD). */ + cgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + cgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], + ldz, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + cgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy, + &zzero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ + +/* B */ + cgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^H is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + clacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + clacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + cgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[ + w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. See the description of Z. */ +/* Also, see the description of CGEEV. */ +/* LAPACK CA */ + if (info1 > 0) { +/* CGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + cgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */ + cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[ + s_offset], lds, &zzero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + clacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], + lds, &zzero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */ +/* LDS, ZZERO, Z, LDZ) */ +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ +/* CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the Ritz vectors */ + if (wntvec) { + cgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRIN */ + +/* BLAS CALL */ + if (wntres) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + q__1.r = -eigs[i__2].r, q__1.i = -eigs[i__2].i; + caxpy_(m, &q__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! */ + + res[i__] = scnrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + + } + } + } + + if (*whtsvd == 4) { + rwork[*n + 1] = xscl1; + rwork[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* cgedmd_ */ + diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c index 447b23014..6e3a1faca 100644 --- a/lapack-netlib/SRC/cgedmdq.c +++ b/lapack-netlib/SRC/cgedmdq.c @@ -509,3 +509,781 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; + +/* Subroutine */ int cgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, + char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, + complex *f, integer *ldf, complex *x, integer *ldx, complex *y, + integer *ldy, integer *nrnk, real *tol, integer *k, complex *eigs, + complex *z__, integer *ldz, real *res, complex *b, integer *ldb, + complex *v, integer *ldv, complex *s, integer *lds, complex *zwork, + integer *lzwork, real *work, integer *lwork, integer *iwork, integer * + liwork, integer *info) +{ + /* System generated locals */ + integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, + z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, + i__1, i__2; + + /* Local variables */ + real zero; + integer info1; + extern logical lsame_(char *, char *); + char jobvl[1]; + integer minmn; + logical wantq; + integer mlwqr, olwqr; + logical wntex; + complex zzero; + extern /* Subroutine */ int cgedmd_(char *, char *, char *, char *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, integer *, real *, integer *, complex *, complex *, + integer *, real *, complex *, integer *, complex *, integer *, + complex *, integer *, complex *, integer *, real *, integer *, + integer *, integer *, integer *), + cgeqrf_(integer *, integer *, complex *, integer *, complex *, + complex *, integer *, integer *), clacpy_(char *, integer *, + integer *, complex *, integer *, complex *, integer *), + claset_(char *, integer *, integer *, complex *, complex *, + complex *, integer *), xerbla_(char *, integer *); + integer mlwdmd, olwdmd; + logical sccolx, sccoly; + extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *, integer *); + integer iminwr; + logical wntvec, wntvcf; + integer mlwgqr; + logical wntref; + integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr; + logical lquery, wntres, wnttrf, wntvcq; + extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, integer *, + complex *, integer *, integer *); + real one; + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real32 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ..... */ +/* Purpose */ +/* ======= */ +/* CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices, using a QR factorization */ +/* based compression of the data. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, CGEDMDQ computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, CGEDMDQ returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office. */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. The data snapshots are the columns */ +/* of F. The leading N-1 columns of F are denoted X and the */ +/* trailing N-1 columns are denoted Y. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Z*V, where Z */ +/* is orthonormal and V contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of F, V, Z. */ +/* 'Q' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Q*Z, where Z */ +/* contains the eigenvectors of the compression of the */ +/* underlying discretised operator onto the span of */ +/* the data snapshots. See the descriptions of F, V, Z. */ +/* Q is from the inital QR facorization. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will */ +/* be computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBQ (input) CHARACTER*1 */ +/* Specifies whether to explicitly compute and return the */ +/* unitary matrix from the QR factorization. */ +/* 'Q' :: The matrix Q of the QR factorization of the data */ +/* snapshot matrix is computed and stored in the */ +/* array F. See the description of F. */ +/* 'N' :: The matrix Q is not explicitly computed. */ +/* ..... */ +/* JOBT (input) CHARACTER*1 */ +/* Specifies whether to return the upper triangular factor */ +/* from the QR factorization. */ +/* 'R' :: The matrix R of the QR factorization of the data */ +/* snapshot matrix F is returned in the array Y. */ +/* See the description of Y and Further details. */ +/* 'N' :: The matrix R is not returned. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* To be useful on exit, this option needs JOBQ='Q'. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: CGESVD (the QR SVD algorithm) */ +/* 2 :: CGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M >= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by CGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N-1) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* Z*V, where Z contains orthonormal matrix (the product of */ +/* Q from the initial QR factorization and the SVD/POD_basis */ +/* returned by CGEDMD in X) and the second factor (the */ +/* eigenvectors of the Rayleigh quotient) is in the array V, */ +/* as returned by CGEDMD. That is, X(:,1:K)*V(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of V(1:K,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) V contains the K eigenvectors of */ +/* the Rayleigh quotient. The Ritz vectors */ +/* (returned in Z) are the product of Q from the initial QR */ +/* factorization (see the description of F) X (see the */ +/* description of X) and V. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by CGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* ZWORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by CGEQRF of the */ +/* M-by-N input matrix F. */ +/* If the call to CGEDMDQ is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for CGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for CGEDMD (see the */ +/* description of LWORK in CGEDMD) */ +/* MLWMQR = N (minimal workspace for */ +/* ZUNMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */ +/* MINMN = MIN(M,N) */ +/* Then */ +/* LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */ +/* is further updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LZWORK = MAX( LZWORK, MINMN+MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LZWORK = MAX( ZLWORK, MINMN+MLWGQR) */ + +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to CGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is the same as in CGEDMD, because in CGEDMDQ */ +/* only CGEDMD requires real workspace for snapshots */ +/* of dimensions MIN(M,N)-by-(N-1). */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for CGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], lzwork, &work[1], &c_n1, &iwork[1], + liwork, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (real) mlwork, zwork[1].i = 0.f; + zwork[2].r = (real) olwork, zwork[2].i = 0.f; + work[1] = (real) mlrwrk; + work[2] = (real) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + claset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + clacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + claset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See CGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + claset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by CGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by CGEDMD. */ + clacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + claset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to CGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + claset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy); + clacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/unitary factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lzwork - minmn; + cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn + + 1], &i__1, &info1); + } + + return 0; + +} /* cgedmdq_ */ + diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c index 447b23014..66b4d5da6 100644 --- a/lapack-netlib/SRC/dgedmd.c +++ b/lapack-netlib/SRC/dgedmd.c @@ -509,3 +509,1245 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; +static integer c__1 = 1; +static integer c__0 = 0; +static integer c__2 = 2; + +/* Subroutine */ int dgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, + integer *whtsvd, integer *m, integer *n, doublereal *x, integer *ldx, + doublereal *y, integer *ldy, integer *nrnk, doublereal *tol, integer * + k, doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz, + doublereal *res, doublereal *b, integer *ldb, doublereal *w, integer + *ldw, doublereal *s, integer *lds, doublereal *work, integer *lwork, + integer *iwork, integer *liwork, integer *info) +{ + /* System generated locals */ + integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, + b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2; + doublereal d__1, d__2; + + /* Local variables */ + doublereal zero, ssum; + integer info1, info2; + extern doublereal dnrm2_(integer *, doublereal *, integer *); + doublereal xscl1, xscl2; + integer i__, j; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal scale; + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *), + dgeev_(char *, char *, integer *, doublereal *, integer *, + doublereal *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern logical lsame_(char *, char *); + logical badxy; + doublereal small; + char jobzl[1]; + extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *); + logical wntex; + doublereal ab[4] /* was [2][2] */; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ int dgesdd_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, integer *, + integer *), dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + extern integer idamax_(integer *, doublereal *, integer *); + extern logical disnan_(doublereal *); + extern /* Subroutine */ int dgesvd_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal + *, integer *, doublereal *, integer *), xerbla_(char *, + integer *); + char t_or_n__[1]; + extern /* Subroutine */ int dgejsv_(char *, char *, char *, char *, char * + , char *, integer *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, integer *, integer *), dlassq_(integer *, doublereal *, + integer *, doublereal *, doublereal *); + logical sccolx, sccoly; + integer lwrsdd, mwrsdd, iminwr; + logical wntref, wntvec; + doublereal rootsc; + integer lwrkev, mlwork, mwrkev, numrnk, olwork; + doublereal rdummy[2]; + integer lwrsvd, mwrsvd; + logical lquery, wntres; + char jsvopt[1]; + extern /* Subroutine */ int mecago_(); + integer mwrsvj, lwrsvq, mwrsvq; + doublereal rdummy2[2], ofl, one; + extern /* Subroutine */ int dgesvdq_(char *, char *, char *, char *, char + *, integer *, integer *, doublereal *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, + integer *, integer *); + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real64 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ............................................................ */ +/* Purpose */ +/* ======= */ +/* DGEDMD computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, DGEDMD computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, DGEDMD returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* ...................................................................... */ +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ............................................................ */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product X(:,1:K)*W, where X */ +/* contains a POD basis (leading left singular vectors */ +/* of the data matrix X) and W contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of K, X, W, Z. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will be */ +/* computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: DGESVD (the QR SVD algorithm) */ +/* 2 :: DGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M>= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1). */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ + +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) REAL(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient (real and */ +/* imaginary parts for each complex conjugate pair of the */ +/* eigenvalues). The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) REAL(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by DGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, WORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */ +/* scaling factor WORK(N+2)/WORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to DGEDMD is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* leng of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* If WHTSVD == 1 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */ +/* If JOBZ == 'N' then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */ +/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */ +/* workspace length of DGESVD. */ +/* If WHTSVD == 2 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */ +/* minimal workspace length of DGESDD. */ +/* If WHTSVD == 3 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = N+M+MAX(3*N+1, */ +/* MAX(1,3*N+M,5*N),MAX(1,N)) */ +/* is the minimal workspace length of DGESVDQ. */ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of DGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of DGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], liwork, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = mwrsvq, i__2 = (integer) rdummy[0]; + lwrsvq = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the DGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using DLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + dlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (y[idamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + dscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using DLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + dlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("DGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + d__1 = one / work[i__]; + dscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + d__1 = -work[i__]; + d__2 = one / (doublereal) (*m); + dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CAL */ + } else if (x[idamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); +/* L */ + dlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + dgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + dlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case DGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + dlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("DGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / work[i__]; + dscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that DGESVD, DGESVDQ and DGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside DGEDMD). */ + dgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + dgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + dgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two DGEMM calls here, can use K for LDZ. */ +/* B */ + dgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + dlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + dlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + dgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of DGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* DGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + dgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + dlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + dgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + d__1 = -reig[i__]; + daxpy_(m, &d__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = dnrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. */ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + d__1 = -one; + dgemm_("N", "N", m, &c__2, &c__2, &d__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = dlange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* dgedmd_ */ + diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c index 447b23014..a743a3156 100644 --- a/lapack-netlib/SRC/dgedmdq.c +++ b/lapack-netlib/SRC/dgedmdq.c @@ -509,3 +509,792 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; + +/* Subroutine */ int dgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, + char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, + doublereal *f, integer *ldf, doublereal *x, integer *ldx, doublereal * + y, integer *ldy, integer *nrnk, doublereal *tol, integer *k, + doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz, + doublereal *res, doublereal *b, integer *ldb, doublereal *v, integer * + ldv, doublereal *s, integer *lds, doublereal *work, integer *lwork, + integer *iwork, integer *liwork, integer *info) +{ + /* System generated locals */ + integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, + z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, + i__1, i__2; + + /* Local variables */ + doublereal zero; + integer info1; + extern logical lsame_(char *, char *); + char jobvl[1]; + integer minmn; + logical wantq; + integer mlwqr, olwqr; + logical wntex; + extern /* Subroutine */ int dgedmd_(char *, char *, char *, char *, + integer *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *, integer *, doublereal *, integer *, + doublereal *, doublereal *, doublereal *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *, doublereal *, integer *, integer *, integer *, integer + *), dgeqrf_(integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + integer *), dlacpy_(char *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *), dlaset_(char *, + integer *, integer *, doublereal *, doublereal *, doublereal *, + integer *), xerbla_(char *, integer *); + integer mlwdmd, olwdmd; + logical sccolx, sccoly; + extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + integer *), dormqr_(char *, char *, integer *, integer *, integer + *, doublereal *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, integer *); + integer iminwr; + logical wntvec, wntvcf; + integer mlwgqr; + logical wntref; + integer mlwork, olwgqr, olwork; + doublereal rdummy[2]; + integer mlwmqr, olwmqr; + logical lquery, wntres, wnttrf, wntvcq; + doublereal one; + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real64 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ..... */ +/* Purpose */ +/* ======= */ +/* DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices, using a QR factorization */ +/* based compression of the data. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, DGEDMDQ computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, DGEDMDQ returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office. */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. The data snapshots are the columns */ +/* of F. The leading N-1 columns of F are denoted X and the */ +/* trailing N-1 columns are denoted Y. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Z*V, where Z */ +/* is orthonormal and V contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of F, V, Z. */ +/* 'Q' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Q*Z, where Z */ +/* contains the eigenvectors of the compression of the */ +/* underlying discretized operator onto the span of */ +/* the data snapshots. See the descriptions of F, V, Z. */ +/* Q is from the initial QR factorization. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will */ +/* be computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBQ (input) CHARACTER*1 */ +/* Specifies whether to explicitly compute and return the */ +/* orthogonal matrix from the QR factorization. */ +/* 'Q' :: The matrix Q of the QR factorization of the data */ +/* snapshot matrix is computed and stored in the */ +/* array F. See the description of F. */ +/* 'N' :: The matrix Q is not explicitly computed. */ +/* ..... */ +/* JOBT (input) CHARACTER*1 */ +/* Specifies whether to return the upper triangular factor */ +/* from the QR factorization. */ +/* 'R' :: The matrix R of the QR factorization of the data */ +/* snapshot matrix F is returned in the array Y. */ +/* See the description of Y and Further details. */ +/* 'N' :: The matrix R is not returned. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* To be useful on exit, this option needs JOBQ='Q'. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: DGESVD (the QR SVD algorithm) */ +/* 2 :: DGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M >= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by DGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consequtive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of Z*V, where the columns of V are the */ +/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */ +/* orthonormal. The columns of V are similarly structured: */ +/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */ +/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */ +/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by DGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by DGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to DGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for DGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for DGEDMD (see the */ +/* description of LWORK in DGEDMD) for */ +/* snapshots of dimensions MIN(M,N)-by-(N-1) */ +/* MLWMQR = N (minimal workspace for */ +/* DORMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for DORGQR[M,N,N]) */ +/* Then */ +/* LWORK = MAX(N+MLWQR, N+MLWDMD) */ +/* is updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for DGEQRF. */ + mlwork = minmn + mlwqr; + if (lquery) { + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1); + olwqr = (integer) rdummy[0]; + olwork = f2cmin(*m,*n) + olwqr; + } + i__1 = *n - 1; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], & + z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], + ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, & + info1); + mlwdmd = (integer) work[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); + iminwr = iwork[1]; + if (lquery) { + olwdmd = (integer) work[2]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], & + z__[z_offset], ldz, &work[1], &c_n1, &info1); + olwmqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = *n; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[ + 1], &c_n1, &info1); + olwgqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + iminwr = f2cmax(1,iminwr); + mlwork = f2cmax(2,mlwork); + if (*lwork < mlwork && ! lquery) { + *info = -31; + } + if (*liwork < iminwr && ! lquery) { + *info = -33; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (doublereal) mlwork; + work[2] = (doublereal) olwork; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lwork - minmn; + dgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + dlaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + dlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + dlaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lwork - minmn; + dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[ + z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See DGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by DGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by DGEDMD. */ + dlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + dlaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lwork - (minmn + *n - 1); + dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to DGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + dlaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy); + dlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/orthogonal factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lwork - (minmn + *n - 1); + dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + + *n], &i__1, &info1); + } + + return 0; + +} /* dgedmdq_ */ + diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c index 447b23014..c8f3a5964 100644 --- a/lapack-netlib/SRC/sgedmd.c +++ b/lapack-netlib/SRC/sgedmd.c @@ -509,3 +509,1238 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; +static integer c__1 = 1; +static integer c__0 = 0; +static integer c__2 = 2; + +/* Subroutine */ int sgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, + integer *whtsvd, integer *m, integer *n, real *x, integer *ldx, real * + y, integer *ldy, integer *nrnk, real *tol, integer *k, real *reig, + real *imeig, real *z__, integer *ldz, real *res, real *b, integer * + ldb, real *w, integer *ldw, real *s, integer *lds, real *work, + integer *lwork, integer *iwork, integer *liwork, integer *info) +{ + /* System generated locals */ + integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, + b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2; + real r__1, r__2; + + /* Local variables */ + real zero, ssum; + integer info1, info2; + real xscl1, xscl2; + extern real snrm2_(integer *, real *, integer *); + integer i__, j; + real scale; + extern logical lsame_(char *, char *); + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + logical badxy; + real small; + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *), sgeev_(char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *, + real *, integer *, real *, integer *, integer *); + char jobzl[1]; + extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + real *, integer *); + logical wntex; + real ab[4] /* was [2][2] */; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + extern /* Subroutine */ int sgesdd_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *, integer *, real *, + integer *, integer *, integer *), xerbla_(char *, integer + *); + char t_or_n__[1]; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern integer isamax_(integer *, real *, integer *); + logical sccolx, sccoly; + extern logical sisnan_(real *); + extern /* Subroutine */ int sgesvd_(char *, char *, integer *, integer *, + real *, integer *, real *, real *, integer *, real *, integer *, + real *, integer *, integer *); + integer lwrsdd, mwrsdd; + extern /* Subroutine */ int sgejsv_(char *, char *, char *, char *, char * + , char *, integer *, integer *, real *, integer *, real *, real *, + integer *, real *, integer *, real *, integer *, integer *, + integer *), + slacpy_(char *, integer *, integer *, real *, integer *, real *, + integer *); + integer iminwr; + logical wntref, wntvec; + real rootsc; + integer lwrkev, mlwork, mwrkev, numrnk, olwork; + real rdummy[2]; + integer lwrsvd, mwrsvd; + logical lquery, wntres; + char jsvopt[1]; + extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + real *), mecago_(); + integer mwrsvj, lwrsvq, mwrsvq; + real rdummy2[2], ofl, one; + extern /* Subroutine */ int sgesvdq_(char *, char *, char *, char *, char + *, integer *, integer *, real *, integer *, real *, real *, + integer *, real *, integer *, integer *, integer *, integer *, + real *, integer *, real *, integer *, integer *); + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real32 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ............................................................ */ +/* Purpose */ +/* ======= */ +/* SGEDMD computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, SGEDMD computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, SGEDMD returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* ...................................................................... */ +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product X(:,1:K)*W, where X */ +/* contains a POD basis (leading left singular vectors */ +/* of the data matrix X) and W contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of K, X, W, Z. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will be */ +/* computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: SGESVD (the QR SVD algorithm) */ +/* 2 :: SGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M>= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, and Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of IMEIG contain */ +/* the imaginary parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* The eigenvalues are determined as follows: */ +/* If IMEIG(i) == 0, then the corresponding eigenvalue is */ +/* real, LAMBDA(i) = REIG(i). */ +/* If IMEIG(i)>0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, and Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* || Z(:,i:i+1)||_F = 1. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of X(:,1:K)*W(1:K,1:K), where the columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */ +/* are similarly structured: If IMEIG(i) == 0 then */ +/* X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */ +/* then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */ +/* X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of REIG, IMEIG and Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) REAL(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient (real and */ +/* imaginary parts for each complex conjugate pair of the */ +/* eigenvalues). The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* left singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) REAL(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, WORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */ +/* scaling factor WORK(N+2)/WORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to SGEDMD is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* If WHTSVD == 1 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */ +/* If JOBZ == 'N' then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */ +/* Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */ +/* workspace length of SGESVD. */ +/* If WHTSVD == 2 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */ +/* minimal workspace length of SGESDD. */ +/* If WHTSVD == 3 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = N+M+MAX(3*N+1, */ +/* MAX(1,3*N+M,5*N),MAX(1,N)) */ +/* is the minimal workspace length of SGESVDQ. */ +/* If WHTSVD == 4 :: */ +/* If JOBZ == 'V', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */ +/* If JOBZ == 'N', then */ +/* LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */ +/* Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */ +/* minimal workspace length of SGEJSV. */ +/* The above expressions are not simplified in order to */ +/* make the usage of WORK more transparent, and for */ +/* easier checking. In any case, LWORK >= 2. */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -18; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -21; + } else if (*ldw < *n) { + *info = -23; + } else if (*lds < *n) { + *info = -25; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwork = f2cmax(2,*n); + olwork = f2cmax(2,*n); + iminwr = 1; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESVD: */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1, + i__2), i__2 = f2cmin(*m,*n) * 5; + mwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvd; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[ + b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, & + info1); +/* Computing MAX */ + i__1 = mwrsvd, i__2 = (integer) rdummy[0]; + lwrsvd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of SGESDD: */ +/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */ +/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */ +/* IMINWR = 8*MIN(M,N) */ +/* Computing MAX */ + i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,* + n) << 2); + mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsdd; + mlwork = f2cmax(i__1,i__2); + iminwr = f2cmin(*m,*n) << 3; + if (lquery) { + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], & + info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) rdummy[0]; + lwrsdd = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsdd; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 3) { +/* LWQP3 = 3*N+1 */ +/* LWORQ = MAX(N, 1) */ +/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */ +/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) */ +/* MLWORK = N + MWRSVQ */ +/* IMINWR = M+N-1 */ + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, rdummy, &c_n1, rdummy2, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) rdummy[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0]; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) rdummy[0]; +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0]; + olwork = f2cmax(i__1,i__2); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; +/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' */ +/* Computing MAX */ + i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (* + n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1) + + *n * *n + 6; + mwrsvj = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrsvj; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 3, i__2 = *m + *n * 3; + iminwr = f2cmax(i__1,i__2); + if (lquery) { +/* Computing MAX */ + i__1 = olwork, i__2 = *n + mwrsvj; + olwork = f2cmax(i__1,i__2); + } + } +/* END SELECT */ + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the SGEEV call */ + if (lsame_(jobzl, "V")) { +/* Computing MAX */ + i__1 = 1, i__2 = *n << 2; + mwrkev = f2cmax(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = 1, i__2 = *n * 3; + mwrkev = f2cmax(i__1,i__2); + } +/* Computing MAX */ + i__1 = mlwork, i__2 = *n + mwrkev; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[ + w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1); +/* Computing MAX */ + i__1 = mwrkev, i__2 = (integer) rdummy[0]; + lwrkev = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = olwork, i__2 = *n + lwrkev; + olwork = f2cmax(i__1,i__2); + } + + if (*liwork < iminwr && ! lquery) { + *info = -29; + } + if (*lwork < mlwork && ! lquery) { + *info = -27; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (real) mlwork; + work[2] = (real) olwork; + return 0; + } +/* ............................................................ */ + + ofl = slamch_("O"); + small = slamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using SLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, X(1,i), 1 ) */ + scale = zero; + slassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[ + i__ * x_dim1 + 1], m, &info2); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * + y_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (y[isamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * + y_dim1] != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + sscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using SLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */ + scale = zero; + slassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (sisnan_(&scale) || sisnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("SGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as WORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + r__1 = one / rootsc; + slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); + work[i__] = -scale * (rootsc / (real) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + work[i__] = scale * rootsc; + slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[ + i__ * y_dim1 + 1], m, &info2); +/* Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPACK */ + } + } else { + work[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (work[i__] > zero) { + r__1 = one / work[i__]; + sscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/WORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (work[i__] < zero) { + r__1 = -work[i__]; + r__2 = one / (real) (*m); + slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * + x_dim1 + 1], m, &info2); +/* LAPACK CA */ + } else if (x[isamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * + x_dim1] != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + i__1 = *lwork - *n; + sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], + ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 2) { + i__1 = *lwork - *n; + sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[ + w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1); +/* LAPACK CAL */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 3) { + i__1 = *lwork - *n - f2cmax(2,*m); + i__2 = f2cmax(2,*m); + sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], & + z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], & + i__2, &info1); + + slacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'T'; + } else if (*whtsvd == 4) { + i__1 = *lwork - *n; + sgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, & + work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1] + , &i__1, &iwork[1], &info1); +/* LAPACK CALL */ + slacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = work[*n + 1]; + xscl2 = work[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case SGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + slascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (work[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("SGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= work[1] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (work[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^T * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^T is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + r__1 = one / work[i__]; + sscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that SGESVD, SGESVDQ and SGESDD return the */ +/* transposed matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + work[*n + i__] = one / work[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1]; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside SGEDMD). */ + sgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRI */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! INTRI */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLAS */ + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + sgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], + ldz, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* BLAS */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + sgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, + &zero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) ) ! IN */ +/* In the two SGEMM calls here, can use K for LDZ */ +/* B */ + sgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset] + , ldw, &zero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! INTRIN */ +/* At this point S = U^T * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^T is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + slacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + slacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + i__1 = *lwork - *n; + sgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset] + , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. Even in the case of complex spectrum, all */ +/* computation is done in real arithmetic. REIG and */ +/* IMEIG are the real and the imaginary parts of the */ +/* eigenvalues, so that the spectrum is given as */ +/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */ +/* are listed at consecutive positions. For such a */ +/* complex conjugate pair of the eigenvalues, the */ +/* corresponding eigenvectors are also a complex */ +/* conjugate pair with the real and imaginary parts */ +/* stored column-wise in W at the corresponding */ +/* consecutive column indices. See the description of Z. */ +/* Also, see the description of SGEEV. */ +/* LAPACK C */ + if (info1 > 0) { +/* SGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + sgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[ + s_offset], lds, &zero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + slacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], + lds, &zero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */ +/* LDS, ZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the real form of the Ritz vectors */ + if (wntvec) { + sgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], + ldw, &zero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__ = 1; + while(i__ <= *k) { + if (imeig[i__] == zero) { +/* have a real eigenvalue with real eigenvector */ + r__1 = -reig[i__]; + saxpy_(m, &r__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i) ! */ + + res[i__] = snrm2_(m, &y[i__ * y_dim1 + 1], &c__1); + ++i__; + } else { +/* Have a complex conjugate pair */ +/* REIG(i) +- sqrt(-1)*IMEIG(i). */ +/* Since all computation is done in real */ +/* arithmetic, the formula for the residual */ +/* is recast for real representation of the */ +/* complex conjugate eigenpair. See the */ +/* description of RES. */ + ab[0] = reig[i__]; + ab[1] = -imeig[i__]; + ab[2] = imeig[i__]; + ab[3] = reig[i__]; + r__1 = -one; + sgemm_("N", "N", m, &c__2, &c__2, &r__1, &z__[i__ * + z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * + y_dim1 + 1], ldy); +/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INT */ +/* BL */ + res[i__] = slange_("F", m, &c__2, &y[i__ * y_dim1 + 1], + ldy, &work[*n + 1]); +/* LA */ + res[i__ + 1] = res[i__]; + i__ += 2; + } + } + } + } + + if (*whtsvd == 4) { + work[*n + 1] = xscl1; + work[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* sgedmd_ */ + diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c index 447b23014..0adf3bda3 100644 --- a/lapack-netlib/SRC/sgedmdq.c +++ b/lapack-netlib/SRC/sgedmdq.c @@ -509,3 +509,788 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; + +/* Subroutine */ int sgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, + char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, real + *f, integer *ldf, real *x, integer *ldx, real *y, integer *ldy, + integer *nrnk, real *tol, integer *k, real *reig, real *imeig, real * + z__, integer *ldz, real *res, real *b, integer *ldb, real *v, integer + *ldv, real *s, integer *lds, real *work, integer *lwork, integer * + iwork, integer *liwork, integer *info) +{ + /* System generated locals */ + integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, + z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, + i__1, i__2; + + /* Local variables */ + real zero; + integer info1; + extern logical lsame_(char *, char *); + char jobvl[1]; + integer minmn; + logical wantq; + integer mlwqr, olwqr; + logical wntex; + extern /* Subroutine */ int sgedmd_(char *, char *, char *, char *, + integer *, integer *, integer *, real *, integer *, real *, + integer *, integer *, real *, integer *, real *, real *, real *, + integer *, real *, real *, integer *, real *, integer *, real *, + integer *, real *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *); + integer mlwdmd, olwdmd; + extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + *, real *, real *, integer *, integer *); + logical sccolx, sccoly; + extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + integer *, real *, integer *), slaset_(char *, integer *, + integer *, real *, real *, real *, integer *); + integer iminwr; + logical wntvec, wntvcf; + integer mlwgqr; + logical wntref; + integer mlwork, olwgqr, olwork; + real rdummy[2]; + integer mlwmqr, olwmqr; + logical lquery, wntres, wnttrf, wntvcq; + extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + *, integer *, real *, real *, integer *, integer *), sormqr_(char + *, char *, integer *, integer *, integer *, real *, integer *, + real *, real *, integer *, real *, integer *, integer *); + real one; + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real32 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ..... */ +/* Purpose */ +/* ======= */ +/* SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices, using a QR factorization */ +/* based compression of the data. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, SGEDMDQ computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, SGEDMDQ returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office. */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. The data snapshots are the columns */ +/* of F. The leading N-1 columns of F are denoted X and the */ +/* trailing N-1 columns are denoted Y. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Z*V, where Z */ +/* is orthonormal and V contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of F, V, Z. */ +/* 'Q' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Q*Z, where Z */ +/* contains the eigenvectors of the compression of the */ +/* underlying discretized operator onto the span of */ +/* the data snapshots. See the descriptions of F, V, Z. */ +/* Q is from the initial QR factorization. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will */ +/* be computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBQ (input) CHARACTER*1 */ +/* Specifies whether to explicitly compute and return the */ +/* orthogonal matrix from the QR factorization. */ +/* 'Q' :: The matrix Q of the QR factorization of the data */ +/* snapshot matrix is computed and stored in the */ +/* array F. See the description of F. */ +/* 'N' :: The matrix Q is not explicitly computed. */ +/* ..... */ +/* JOBT (input) CHARACTER*1 */ +/* Specifies whether to return the upper triangular factor */ +/* from the QR factorization. */ +/* 'R' :: The matrix R of the QR factorization of the data */ +/* snapshot matrix F is returned in the array Y. */ +/* See the description of Y and Further details. */ +/* 'N' :: The matrix R is not returned. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* To be useful on exit, this option needs JOBQ='Q'. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: SGESVD (the QR SVD algorithm) */ +/* 2 :: SGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M >= 0 */ +/* The state space dimension (the number of rows of F) */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) REAL(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by SGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in WORK(1:N). */ +/* See the description of WORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K)of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X */ +/* ..... */ +/* Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* REIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N) entries of REIG contain */ +/* the real parts of the computed eigenvalues */ +/* REIG(1:K) + sqrt(-1)*IMEIG(1:K). */ +/* See the descriptions of K, IMEIG, Z. */ +/* ..... */ +/* IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K0, then the corresponding complex */ +/* conjugate pair of eigenvalues reads */ +/* LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) */ +/* LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */ +/* That is, complex conjugate pairs have consecutive */ +/* indices (i,i+1), with the positive imaginary part */ +/* listed first. */ +/* See the descriptions of K, REIG, Z. */ +/* ..... */ +/* Z (workspace/output) REAL(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then */ +/* Z contains real Ritz vectors as follows: */ +/* If IMEIG(i)=0, then Z(:,i) is an eigenvector of */ +/* the i-th Ritz value. */ +/* If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */ +/* [Z(:,i) Z(:,i+1)] span an invariant subspace and */ +/* the Ritz values extracted from this subspace are */ +/* REIG(i) + sqrt(-1)*IMEIG(i) and */ +/* REIG(i) - sqrt(-1)*IMEIG(i). */ +/* The corresponding eigenvectors are */ +/* Z(:,i) + sqrt(-1)*Z(:,i+1) and */ +/* Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */ +/* If JOBZ == 'F', then the above descriptions hold for */ +/* the columns of Z*V, where the columns of V are the */ +/* eigenvectors of the K-by-K Rayleigh quotient, and Z is */ +/* orthonormal. The columns of V are similarly structured: */ +/* If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */ +/* IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */ +/* Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */ +/* are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */ +/* See the descriptions of REIG, IMEIG, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs. */ +/* If LAMBDA(i) is real, then */ +/* RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */ +/* If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */ +/* then */ +/* RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */ +/* where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */ +/* [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */ +/* It holds that */ +/* RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 */ +/* RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */ +/* where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) */ +/* ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) */ +/* See the description of Z. */ +/* ..... */ +/* B (output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) contains the K eigenvectors of */ +/* the Rayleigh quotient. The eigenvectors of a complex */ +/* conjugate pair of eigenvalues are returned in real form */ +/* as explained in the description of Z. The Ritz vectors */ +/* (returned in Z) are the product of X and V; see */ +/* the descriptions of X and Z. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by SGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by SGEQRF of the */ +/* M-by-N input matrix F. */ +/* WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to SGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. Hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for SGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for SGEDMD (see the */ +/* description of LWORK in SGEDMD) for */ +/* snapshots of dimensions MIN(M,N)-by-(N-1) */ +/* MLWMQR = N (minimal workspace for */ +/* SORMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for SORGQR[M,N,N]) */ +/* Then */ +/* LWORK = MAX(N+MLWQR, N+MLWDMD) */ +/* is updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) */ +/* if JOBQ == 'Q' THEN */ +/* LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a worskpace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local array */ +/* ~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --reig; + --imeig; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; +/* .......................................................... */ + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -22; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -25; + } else if (*ldv < *n - 1) { + *info = -27; + } else if (*lds < *n - 1) { + *info = -29; + } + + if (wntvec || wntvcf) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + work[1] = 2.f; + work[2] = 2.f; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for SGEQRF. */ + mlwork = f2cmin(*m,*n) + mlwqr; + if (lquery) { + sgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1); + olwqr = (integer) rdummy[0]; + olwork = f2cmin(*m,*n) + olwqr; + } + i__1 = *n - 1; + sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], & + z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], + ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, & + info1); + mlwdmd = (integer) work[1]; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); + iminwr = iwork[1]; + if (lquery) { + olwdmd = (integer) work[2]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], & + z__[z_offset], ldz, &work[1], &c_n1, &info1); + olwmqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = *n; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[ + 1], &c_n1, &info1); + olwgqr = (integer) work[1]; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + iminwr = f2cmax(1,iminwr); + mlwork = f2cmax(2,mlwork); + if (*lwork < mlwork && ! lquery) { + *info = -31; + } + if (*liwork < iminwr && ! lquery) { + *info = -33; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + work[1] = (real) mlwork; + work[2] = (real) olwork; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >>N , at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lwork - minmn; + sgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + slaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx); + i__1 = *n - 1; + slacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + slacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + slaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lwork - minmn; + sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[ + z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + slaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lwork - (minmn + *n - 1); + sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by SGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by SGEDMD. */ + slacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + slaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lwork - (minmn + *n - 1); + sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[ + z_offset], ldz, &work[minmn + *n], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to SGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + slaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy); + slacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/orthogonal factor in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lwork - (minmn + *n - 1); + sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + + *n], &i__1, &info1); + } + + return 0; + +} /* sgedmdq_ */ + diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c index 447b23014..c1b39ba3e 100644 --- a/lapack-netlib/SRC/zgedmd.c +++ b/lapack-netlib/SRC/zgedmd.c @@ -509,3 +509,1168 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; +static integer c__1 = 1; +static integer c__0 = 0; + +/* Subroutine */ int zgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, + integer *whtsvd, integer *m, integer *n, doublecomplex *x, integer * + ldx, doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, + integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, + doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *w, + integer *ldw, doublecomplex *s, integer *lds, doublecomplex *zwork, + integer *lzwork, doublereal *rwork, integer *lrwork, integer *iwork, + integer *liwork, integer *info) +{ + /* System generated locals */ + integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, + b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, + i__4, i__5; + doublereal d__1, d__2; + doublecomplex z__1, z__2; + + /* Local variables */ + doublecomplex zone; + doublereal zero, ssum; + integer info1, info2; + doublereal xscl1, xscl2; + integer i__, j; + doublereal scale; + extern logical lsame_(char *, char *); + logical badxy; + doublereal small; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + char jobzl[1]; + extern /* Subroutine */ int zgeev_(char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublereal *, integer *); + logical wntex; + doublecomplex zzero; + extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_( + char *); + extern logical disnan_(doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + char t_or_n__[1]; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zgesdd_(char *, integer *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublereal *, integer *, integer *), zlascl_(char *, + integer *, integer *, doublereal *, doublereal *, integer *, + integer *, doublecomplex *, integer *, integer *); + extern integer izamax_(integer *, doublecomplex *, integer *); + logical sccolx, sccoly; + integer lwrsdd, mwrsdd; + extern /* Subroutine */ int zgesvd_(char *, char *, integer *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublereal *, integer *), zlacpy_(char *, + integer *, integer *, doublecomplex *, integer *, doublecomplex *, + integer *); + integer iminwr; + logical wntref, wntvec; + doublereal rootsc; + integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk; + logical lquery, wntres; + char jsvopt[1]; + integer lwrsvj, mwrsvj; + doublereal rdummy[2]; + extern /* Subroutine */ int zgejsv_(char *, char *, char *, char *, char * + , char *, integer *, integer *, doublecomplex *, integer *, + doublereal *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, doublereal *, integer *, + integer *, integer *), zlassq_(integer *, doublecomplex *, integer *, + doublereal *, doublereal *), mecago_(); + integer lwrsvq, mwrsvq; + doublereal ofl, one; + extern /* Subroutine */ int zgesvdq_(char *, char *, char *, char *, char + *, integer *, integer *, doublecomplex *, integer *, doublereal *, + doublecomplex *, integer *, doublecomplex *, integer *, integer * + , integer *, integer *, doublecomplex *, integer *, doublereal *, + integer *, integer *); + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real64 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ............................................................ */ +/* Purpose */ +/* ======= */ +/* ZGEDMD computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, ZGEDMD computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, ZGEDMD returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* ...................................................................... */ +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ............................................................ */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product X(:,1:K)*W, where X */ +/* contains a POD basis (leading left singular vectors */ +/* of the data matrix X) and W contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of K, X, W, Z. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will be */ +/* computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: ZGESVD (the QR SVD algorithm) */ +/* 2 :: ZGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M>= 0 */ +/* The state space dimension (the row dimension of X, Y). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshot pairs */ +/* (the number of columns of X and Y). */ +/* ..... */ +/* X (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, X contains the data snapshot matrix X. It is */ +/* assumed that the column norms of X are in the range of */ +/* the normalized floating point numbers. */ +/* < On exit, the leading K columns of X contain a POD basis, */ +/* i.e. the leading K left singular vectors of the input */ +/* data matrix X, U(:,1:K). All N columns of X contain all */ +/* left singular vectors of the input matrix X. */ +/* See the descriptions of K, Z and W. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= M */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, Y contains the data snapshot matrix Y */ +/* < On exit, */ +/* If JOBR == 'R', the leading K columns of Y contain */ +/* the residual vectors for the computed Ritz pairs. */ +/* See the description of RES. */ +/* If JOBR == 'N', Y contains the original input data, */ +/* scaled according to the value of JOBS. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= M */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the descriptions of TOL and K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the POD basis for the data snapshot */ +/* matrix X and the number of the computed Ritz pairs. */ +/* The value of K is determined according to the rule set */ +/* by the parameters NRNK and TOL. */ +/* See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) N-by-1 array */ +/* The leading K (K<=N) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-N array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of W(1:k,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and W. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) N-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) M-by-N array. */ +/* IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:M,1:K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* If JOBF =='N', then B is not referenced. */ +/* See the descriptions of X, W, K. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= M */ +/* The leading dimension of the array B. */ +/* ..... */ +/* W (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* On exit, W(1:K,1:K) contains the K computed */ +/* eigenvectors of the matrix Rayleigh quotient. */ +/* The Ritz vectors (returned in Z) are the */ +/* product of X (containing a POD basis for the input */ +/* matrix X) and W. See the descriptions of K, S, X and Z. */ +/* W is also used as a workspace to temporarily store the */ +/* right singular vectors of X. */ +/* ..... */ +/* LDW (input) INTEGER, LDW >= N */ +/* The leading dimension of the array W. */ +/* ..... */ +/* S (workspace/output) COMPLEX(KIND=WP) N-by-N array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by ZGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */ +/* ZWORK is used as complex workspace in the complex SVD, as */ +/* specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing */ +/* the eigenvalues of a Rayleigh quotient. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), */ +/* where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal */ +/* LZWORK_SVD is calculated as follows */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LZWORK_SVD = obtainable by a query */ +/* If on entry LZWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths and returns them in */ +/* LZWORK(1) and LZWORK(2), respectively. */ +/* ..... */ +/* RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */ +/* On exit, RWORK(1:N) contains the singular values of */ +/* X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */ +/* If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */ +/* scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */ +/* and Y to avoid overflow in the SVD of X. */ +/* This may be of interest if the scaling option is off */ +/* and as many as possible smallest eigenvalues are */ +/* desired to the highest feasible accuracy. */ +/* If the call to ZGEDMD is only workspace query, then */ +/* RWORK(1) contains the minimal workspace length. */ +/* See the description of LRWORK. */ +/* ..... */ +/* LRWORK (input) INTEGER */ +/* The minimal length of the workspace vector RWORK. */ +/* LRWORK is calculated as follows: */ +/* LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where */ +/* LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */ +/* for the SVD subroutine determined by the input parameter */ +/* WHTSVD. */ +/* If WHTSVD == 1 :: ZGESVD :: */ +/* LRWORK_SVD = 5*MIN(M,N) */ +/* If WHTSVD == 2 :: ZGESDD :: */ +/* LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */ +/* 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */ +/* If WHTSVD == 3 :: ZGESVDQ :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If WHTSVD == 4 :: ZGEJSV :: */ +/* LRWORK_SVD = obtainable by a query */ +/* If on entry LRWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* real workspace length and returns it in RWORK(1). */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for ZWORK, RWORK and */ +/* IWORK. See the descriptions of ZWORK, RWORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* Local arrays */ +/* ~~~~~~~~~~~~ */ +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* ............................................................ */ + /* Parameter adjustments */ + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + w_dim1 = *ldw; + w_offset = 1 + w_dim1 * 1; + w -= w_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --rwork; + --iwork; + + /* Function Body */ + zero = 0.f; + one = 1.f; + zzero.r = 0.f, zzero.i = 0.f; + zone.r = 1.f, zone.i = 0.f; + +/* Test the input arguments */ + + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + *info = 0; + lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || lsame_(jobz, "N") || lsame_( + jobz, "F"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && ! wntvec) { + *info = -3; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -4; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -5; + } else if (*m < 0) { + *info = -6; + } else if (*n < 0 || *n > *m) { + *info = -7; + } else if (*ldx < *m) { + *info = -9; + } else if (*ldy < *m) { + *info = -11; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -12; + } else if (*tol < zero || *tol >= one) { + *info = -13; + } else if (*ldz < *m) { + *info = -17; + } else if ((wntref || wntex) && *ldb < *m) { + *info = -20; + } else if (*ldw < *n) { + *info = -22; + } else if (*lds < *n) { + *info = -24; + } + + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0) { +/* Quick return. All output except K is void. */ +/* INFO=1 signals the void input. */ +/* In case of a workspace query, the default */ +/* minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + rwork[1] = 1.; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + iminwr = 1; + mlrwrk = f2cmax(1,*n); + mlwork = 2; + olwork = 2; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESVD: */ +/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */ +/* Computing MAX */ + i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n); + mwrsvd = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrsvd); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[ + b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, + rdummy, &info1); + lwrsvd = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvd); + } + } else if (*whtsvd == 2) { +/* The following is specified as the minimal */ +/* length of WORK in the definition of ZGESDD: */ +/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */ +/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */ +/* In LAPACK 3.10.1 RWORK is defined differently. */ +/* Below we take f2cmax over the two versions. */ +/* IMINWR = 8*MIN(M,N) */ + mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax( + *m,*n); + mlwork = f2cmax(mlwork,mwrsdd); + iminwr = f2cmin(*m,*n) << 3; +/* Computing MAX */ +/* Computing MAX */ + i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(* + m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3, + i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) + << 1) * f2cmin(*m,*n) + f2cmin(*m,*n); + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, & + iwork[1], &info1); +/* Computing MAX */ + i__1 = mwrsdd, i__2 = (integer) zwork[1].r; + lwrsdd = f2cmax(i__1,i__2); +/* Possible bug in ZGESDD optimal workspace size. */ + olwork = f2cmax(olwork,lwrsdd); + } + } else if (*whtsvd == 3) { + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[ + 1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, & + iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1); + iminwr = iwork[1]; + mwrsvq = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvq); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0]; + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvq = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvq); + } + } else if (*whtsvd == 4) { + *(unsigned char *)jsvopt = 'J'; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, + &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[ + 1], &c_n1, rdummy, &c_n1, &iwork[1], &info1); + iminwr = iwork[1]; + mwrsvj = (integer) zwork[2].r; + mlwork = f2cmax(mlwork,mwrsvj); +/* Computing MAX */ +/* Computing MAX */ + i__3 = 7, i__4 = (integer) rdummy[0]; + i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + lwrsvj = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrsvj); + } +/* END SELECT */ + } + if (wntvec || wntex || lsame_(jobz, "F")) { + *(unsigned char *)jobzl = 'V'; + } else { + *(unsigned char *)jobzl = 'N'; + } +/* Workspace calculation to the ZGEEV call */ +/* Computing MAX */ + i__1 = 1, i__2 = *n << 1; + mwrkev = f2cmax(i__1,i__2); + mlwork = f2cmax(mlwork,mwrkev); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = *n + (*n << 1); + mlrwrk = f2cmax(i__1,i__2); + if (lquery) { + zgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], + ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], & + info1); + lwrkev = (integer) zwork[1].r; + olwork = f2cmax(olwork,lwrkev); + } + + if (*liwork < iminwr && ! lquery) { + *info = -30; + } + if (*lrwork < mlrwrk && ! lquery) { + *info = -28; + } + if (*lzwork < mlwork && ! lquery) { + *info = -26; + } + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + rwork[1] = (doublereal) mlrwrk; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + return 0; + } +/* ............................................................ */ + + ofl = dlamch_("O"); + small = dlamch_("S"); + badxy = FALSE_; + +/* <1> Optional scaling of the snapshots (columns of X, Y) */ +/* ========================================================== */ + if (sccolx) { +/* The columns of X will be normalized. */ +/* To prevent overflows, the column norms of X are */ +/* carefully computed using ZLASSQ. */ + *k = 0; + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* WORK(i) = DZNRM2( M, X(1,i), 1 ) */ + scale = zero; + zlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -8; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of X(:,i) overflows. First, X(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */ +/* Next, the norm of X(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of X(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[ + i__ * x_dim1 + 1], ldx, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* X(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + x[i__ * x_dim1 + 1], ldx, &info2); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* LAPACK CALL */ + } + } else { + rwork[i__] = zero; + ++(*k); + } + } + if (*k == *n) { +/* All columns of X are zero. Return error code -8. */ +/* (the 8th input variable had an illegal value) */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of Y. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * + y_dim1 + 1], ldy, &info2); +/* LAPACK C */ + } else if (z_abs(&y[izamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ + * y_dim1]) != zero) { +/* X(:,i) is zero vector. For consistency, */ +/* Y(:,i) should also be zero. If Y(:,i) is not */ +/* zero, then the data might be inconsistent or */ +/* corrupted. If JOBS == 'C', Y(:,i) is set to */ +/* zero and a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + if (lsame_(jobs, "C")) { + zdscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1); + } +/* BLAS CALL */ + } + } + } + + if (sccoly) { +/* The columns of Y will be normalized. */ +/* To prevent overflows, the column norms of Y are */ +/* carefully computed using ZLASSQ. */ + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* RWORK(i) = DZNRM2( M, Y(1,i), 1 ) */ + scale = zero; + zlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum); + if (disnan_(&scale) || disnan_(&ssum)) { + *k = 0; + *info = -10; + i__2 = -(*info); + xerbla_("ZGEDMD", &i__2); + } + if (scale != zero && ssum != zero) { + rootsc = sqrt(ssum); + if (scale >= ofl / rootsc) { +/* Norm of Y(:,i) overflows. First, Y(:,i) */ +/* is scaled by */ +/* ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */ +/* Next, the norm of Y(:,i) is stored without */ +/* overflow as RWORK(i) = - SCALE * (ROOTSC/M), */ +/* the minus sign indicating the 1/M factor. */ +/* Scaling is performed without overflow, and */ +/* underflow may occur in the smallest entries */ +/* of Y(:,i). The relative backward and forward */ +/* errors are small in the ell_2 norm. */ + d__1 = one / rootsc; + zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[ + i__ * y_dim1 + 1], ldy, &info2); + rwork[i__] = -scale * (rootsc / (doublereal) (*m)); + } else { +/* Y(:,i) will be scaled to unit 2-norm */ + rwork[i__] = scale * rootsc; + zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, & + y[i__ * y_dim1 + 1], ldy, &info2); +/* Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i) ! INTRINSIC */ +/* LAPAC */ + } + } else { + rwork[i__] = zero; + } + } + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { +/* Now, apply the same scaling to the columns of X. */ + if (rwork[i__] > zero) { + d__1 = one / rwork[i__]; + zdscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1); +/* X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i) ! INTRINSIC */ +/* BLAS CALL */ + } else if (rwork[i__] < zero) { + d__1 = -rwork[i__]; + d__2 = one / (doublereal) (*m); + zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * + x_dim1 + 1], ldx, &info2); +/* LAPACK C */ + } else if (z_abs(&x[izamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ + * x_dim1]) != zero) { +/* Y(:,i) is zero vector. If X(:,i) is not */ +/* zero, then a warning flag is raised. */ +/* The computation continues but the */ +/* situation will be reported in the output. */ + badxy = TRUE_; + } + } + } + +/* <2> SVD of the data snapshot matrix X. */ +/* ===================================== */ +/* The left singular vectors are stored in the array X. */ +/* The right singular vectors are in the array W. */ +/* The array W will later on contain the eigenvectors */ +/* of a Rayleigh quotient. */ + numrnk = *n; +/* SELECT CASE ( WHTSVD ) */ + if (*whtsvd == 1) { + zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], + ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], & + info1); +/* LA */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 2) { + zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, & + w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1] + , &info1); +/* LAP */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 3) { + i__1 = *lrwork - *n; + zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], + &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], + liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1); +/* LAPACK CA */ + zlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK C */ + *(unsigned char *)t_or_n__ = 'C'; + } else if (*whtsvd == 4) { + i__1 = *lrwork - *n; + zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, & + rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], + lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1); + zlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx); +/* LAPACK CALL */ + *(unsigned char *)t_or_n__ = 'N'; + xscl1 = rwork[*n + 1]; + xscl2 = rwork[*n + 2]; + if (xscl1 != xscl2) { +/* This is an exceptional situation. If the */ +/* data matrices are not scaled and the */ +/* largest singular value of X overflows. */ +/* In that case ZGEJSV can return the SVD */ +/* in scaled form. The scaling factor can be used */ +/* to rescale the data (X and Y). */ + zlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], + ldy, &info2); + } +/* END SELECT */ + } + + if (info1 > 0) { +/* The SVD selected subroutine did not converge. */ +/* Return with an error code. */ + *info = 2; + return 0; + } + + if (rwork[1] == zero) { +/* The largest computed singular value of (scaled) */ +/* X is zero. Return error code -8 */ +/* (the 8th input variable had an illegal value). */ + *k = 0; + *info = -8; + i__1 = -(*info); + xerbla_("ZGEDMD", &i__1); + return 0; + } + +/* <3> Determine the numerical rank of the data */ +/* snapshots matrix X. This depends on the */ +/* parameters NRNK and TOL. */ +/* SELECT CASE ( NRNK ) */ + if (*nrnk == -1) { + *k = 1; + i__1 = numrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else if (*nrnk == -2) { + *k = 1; + i__1 = numrnk - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } + } else { + *k = 1; + i__1 = *nrnk; + for (i__ = 2; i__ <= i__1; ++i__) { + if (rwork[i__] <= small) { + myexit_(); + } + ++(*k); + } +/* END SELECT */ + } +/* Now, U = X(1:M,1:K) is the SVD/POD basis for the */ +/* snapshot data in the input matrix X. */ +/* <4> Compute the Rayleigh quotient S = U^H * A * U. */ +/* Depending on the requested outputs, the computation */ +/* is organized to compute additional auxiliary */ +/* matrices (for the residuals and refinements). */ + +/* In all formulas below, we need V_k*Sigma_k^(-1) */ +/* where either V_k is in W(1:N,1:K), or V_k^H is in */ +/* W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */ + if (lsame_(t_or_n__, "N")) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + d__1 = one / rwork[i__]; + zdscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1); +/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i) ! INTRINSIC */ +/* BLAS CALL */ + } + } else { +/* This non-unit stride access is due to the fact */ +/* that ZGESVD, ZGESVDQ and ZGESDD return the */ +/* adjoint matrix of the right singular vectors. */ +/* DO i = 1, K */ +/* CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW ) ! BLAS CALL */ +/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N) ! INTRINSIC */ +/* END DO */ + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + rwork[*n + i__] = one / rwork[i__]; + } + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *k; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * w_dim1; + i__4 = *n + i__; + z__2.r = rwork[i__4], z__2.i = zero; + i__5 = i__ + j * w_dim1; + z__1.r = z__2.r * w[i__5].r - z__2.i * w[i__5].i, z__1.i = + z__2.r * w[i__5].i + z__2.i * w[i__5].r; + w[i__3].r = z__1.r, w[i__3].i = z__1.i; + } + } + } + + if (wntref) { + +/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */ +/* for computing the refined Ritz vectors */ +/* (optionally, outside ZGEDMD). */ + zgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K)) ! */ + +/* At this point Z contains */ +/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */ +/* this is needed for computing the residuals. */ +/* This matrix is returned in the array B and */ +/* it can be used to compute refined Ritz vectors. */ +/* BLA */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); +/* B(1:M,1:K) = Z(1:M,1:K) ! INTRINSIC */ +/* BLAS CALL */ + zgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], + ldz, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* BLA */ + } else { +/* A * U(:,1:K) is not explicitly needed and the */ +/* computation is organized differently. The Rayleigh */ +/* quotient is computed more efficiently. */ + zgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy, + &zzero, &z__[z_offset], ldz); +/* Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) */ + + zgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */ +/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K))) ! */ +/* At this point S = U^H * A * U is the Rayleigh quotient. */ +/* If the residuals are requested, save scaled V_k into Z. */ +/* Recall that V_k or V_k^H is stored in W. */ +/* BLAS */ + if (wntres || wntex) { + if (lsame_(t_or_n__, "N")) { + zlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz); + } else { + zlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz); + } + } + } + +/* <5> Compute the Ritz values and (if requested) the */ +/* right eigenvectors of the Rayleigh quotient. */ + + zgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[ + w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1); + +/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */ +/* quotient. See the description of Z. */ +/* Also, see the description of ZGEEV. */ +/* LAPACK CALL */ + if (info1 > 0) { +/* ZGEEV failed to compute the eigenvalues and */ +/* eigenvectors of the Rayleigh quotient. */ + *info = 3; + return 0; + } + +/* <6> Compute the eigenvectors (if requested) and, */ +/* the residuals (if requested). */ + + if (wntvec || wntex) { + if (wntres) { + if (wntref) { +/* Here, if the refinement is requested, we have */ +/* A*U(:,1:K) already computed and stored in Z. */ +/* For the residuals, need Y = A * U(:,1;K) * W. */ + zgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &y[y_offset], ldy); +/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K) ! INTRINSIC */ +/* This frees Z; Y contains A * U(:,1:K) * W. */ +/* BLAS CALL */ + } else { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[ + s_offset], lds, &zzero, &z__[z_offset], ldz); +/* Save a copy of Z into Y and free Z for holding */ +/* the Ritz vectors. */ + zlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy); + if (wntex) { + zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb); + } + } + } else if (wntex) { +/* Compute S = V_k * Sigma_k^(-1) * W, where */ +/* V_k * Sigma_k^(-1) is stored in Z */ + zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[ + w_offset], ldw, &zzero, &s[s_offset], lds); +/* Then, compute Z = Y * S = */ +/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */ +/* = A * U(:,1:K) * W(1:K,1:K) */ + zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], + lds, &zzero, &b[b_offset], ldb); +/* The above call replaces the following two calls */ +/* that were used in the developing-testing phase. */ +/* CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */ +/* LDS, ZZERO, Z, LDZ) */ +/* Save a copy of Z into B and free Z for holding */ +/* the Ritz vectors. */ +/* CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) */ + } + +/* Compute the Ritz vectors */ + if (wntvec) { + zgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], + ldw, &zzero, &z__[z_offset], ldz); + } +/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K)) ! INTRINSIC */ + +/* BLAS CALL */ + if (wntres) { + i__1 = *k; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + z__1.r = -eigs[i__2].r, z__1.i = -eigs[i__2].i; + zaxpy_(m, &z__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * + y_dim1 + 1], &c__1); +/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i) ! INTR */ +/* BLAS */ + res[i__] = dznrm2_(m, &y[i__ * y_dim1 + 1], &c__1); +/* BLAS */ + } + } + } + + if (*whtsvd == 4) { + rwork[*n + 1] = xscl1; + rwork[*n + 2] = xscl2; + } + +/* Successful exit. */ + if (! badxy) { + *info = 0; + } else { +/* A warning on possible data inconsistency. */ +/* This should be a rare event. */ + *info = 4; + } +/* ............................................................ */ + return 0; +/* ...... */ +} /* zgedmd_ */ + diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c index 447b23014..1815f0814 100644 --- a/lapack-netlib/SRC/zgedmdq.c +++ b/lapack-netlib/SRC/zgedmdq.c @@ -509,3 +509,785 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c_n1 = -1; + +/* Subroutine */ int zgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, + char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, + doublecomplex *f, integer *ldf, doublecomplex *x, integer *ldx, + doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, + integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, + doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *v, + integer *ldv, doublecomplex *s, integer *lds, doublecomplex *zwork, + integer *lzwork, doublereal *work, integer *lwork, integer *iwork, + integer *liwork, integer *info) +{ + /* System generated locals */ + integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, + z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, + i__1, i__2; + + /* Local variables */ + doublereal zero; + integer info1; + extern logical lsame_(char *, char *); + char jobvl[1]; + integer minmn; + logical wantq; + integer mlwqr, olwqr; + logical wntex; + doublecomplex zzero; + extern /* Subroutine */ int zgedmd_(char *, char *, char *, char *, + integer *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, integer *, doublereal *, integer *, + doublecomplex *, doublecomplex *, integer *, doublereal *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublereal *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *); + integer mlwdmd, olwdmd; + logical sccolx, sccoly; + extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + integer *, doublecomplex *, doublecomplex *, integer *, integer * + ), zlacpy_(char *, integer *, integer *, doublecomplex *, integer + *, doublecomplex *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *); + integer iminwr; + logical wntvec, wntvcf; + integer mlwgqr; + logical wntref; + integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr; + logical lquery, wntres, wnttrf, wntvcq; + extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, integer *), zunmqr_(char *, char *, integer *, integer + *, integer *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); + doublereal one; + +/* March 2023 */ +/* ..... */ +/* USE iso_fortran_env */ +/* INTEGER, PARAMETER :: WP = real64 */ +/* ..... */ +/* Scalar arguments */ +/* Array arguments */ +/* ..... */ +/* Purpose */ +/* ======= */ +/* ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */ +/* a pair of data snapshot matrices, using a QR factorization */ +/* based compression of the data. For the input matrices */ +/* X and Y such that Y = A*X with an unaccessible matrix */ +/* A, ZGEDMDQ computes a certain number of Ritz pairs of A using */ +/* the standard Rayleigh-Ritz extraction from a subspace of */ +/* range(X) that is determined using the leading left singular */ +/* vectors of X. Optionally, ZGEDMDQ returns the residuals */ +/* of the computed Ritz pairs, the information needed for */ +/* a refinement of the Ritz vectors, or the eigenvectors of */ +/* the Exact DMD. */ +/* For further details see the references listed */ +/* below. For more details of the implementation see [3]. */ + +/* References */ +/* ========== */ +/* [1] P. Schmid: Dynamic mode decomposition of numerical */ +/* and experimental data, */ +/* Journal of Fluid Mechanics 656, 5-28, 2010. */ +/* [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */ +/* decompositions: analysis and enhancements, */ +/* SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */ +/* [3] Z. Drmac: A LAPACK implementation of the Dynamic */ +/* Mode Decomposition I. Technical report. AIMDyn Inc. */ +/* and LAPACK Working Note 298. */ +/* [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */ +/* Brunton, N. Kutz: On Dynamic Mode Decomposition: */ +/* Theory and Applications, Journal of Computational */ +/* Dynamics 1(2), 391 -421, 2014. */ + +/* Developed and supported by: */ +/* =========================== */ +/* Developed and coded by Zlatko Drmac, Faculty of Science, */ +/* University of Zagreb; drmac@math.hr */ +/* In cooperation with */ +/* AIMdyn Inc., Santa Barbara, CA. */ +/* and supported by */ +/* - DARPA SBIR project "Koopman Operator-Based Forecasting */ +/* for Nonstationary Processes from Near-Term, Limited */ +/* Observational Data" Contract No: W31P4Q-21-C-0007 */ +/* - DARPA PAI project "Physics-Informed Machine Learning */ +/* Methodologies" Contract No: HR0011-18-9-0033 */ +/* - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */ +/* Framework for Space-Time Analysis of Process Dynamics" */ +/* Contract No: HR0011-16-C-0116 */ +/* Any opinions, findings and conclusions or recommendations */ +/* expressed in this material are those of the author and */ +/* do not necessarily reflect the views of the DARPA SBIR */ +/* Program Office. */ +/* ============================================================ */ +/* Distribution Statement A: */ +/* Approved for Public Release, Distribution Unlimited. */ +/* Cleared by DARPA on September 29, 2022 */ +/* ============================================================ */ +/* ...................................................................... */ +/* Arguments */ +/* ========= */ +/* JOBS (input) CHARACTER*1 */ +/* Determines whether the initial data snapshots are scaled */ +/* by a diagonal matrix. The data snapshots are the columns */ +/* of F. The leading N-1 columns of F are denoted X and the */ +/* trailing N-1 columns are denoted Y. */ +/* 'S' :: The data snapshots matrices X and Y are multiplied */ +/* with a diagonal matrix D so that X*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'C' :: The snapshots are scaled as with the 'S' option. */ +/* If it is found that an i-th column of X is zero */ +/* vector and the corresponding i-th column of Y is */ +/* non-zero, then the i-th column of Y is set to */ +/* zero and a warning flag is raised. */ +/* 'Y' :: The data snapshots matrices X and Y are multiplied */ +/* by a diagonal matrix D so that Y*D has unit */ +/* nonzero columns (in the Euclidean 2-norm) */ +/* 'N' :: No data scaling. */ +/* ..... */ +/* JOBZ (input) CHARACTER*1 */ +/* Determines whether the eigenvectors (Koopman modes) will */ +/* be computed. */ +/* 'V' :: The eigenvectors (Koopman modes) will be computed */ +/* and returned in the matrix Z. */ +/* See the description of Z. */ +/* 'F' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Z*V, where Z */ +/* is orthonormal and V contains the eigenvectors */ +/* of the corresponding Rayleigh quotient. */ +/* See the descriptions of F, V, Z. */ +/* 'Q' :: The eigenvectors (Koopman modes) will be returned */ +/* in factored form as the product Q*Z, where Z */ +/* contains the eigenvectors of the compression of the */ +/* underlying discretized operator onto the span of */ +/* the data snapshots. See the descriptions of F, V, Z. */ +/* Q is from the initial QR factorization. */ +/* 'N' :: The eigenvectors are not computed. */ +/* ..... */ +/* JOBR (input) CHARACTER*1 */ +/* Determines whether to compute the residuals. */ +/* 'R' :: The residuals for the computed eigenpairs will */ +/* be computed and stored in the array RES. */ +/* See the description of RES. */ +/* For this option to be legal, JOBZ must be 'V'. */ +/* 'N' :: The residuals are not computed. */ +/* ..... */ +/* JOBQ (input) CHARACTER*1 */ +/* Specifies whether to explicitly compute and return the */ +/* unitary matrix from the QR factorization. */ +/* 'Q' :: The matrix Q of the QR factorization of the data */ +/* snapshot matrix is computed and stored in the */ +/* array F. See the description of F. */ +/* 'N' :: The matrix Q is not explicitly computed. */ +/* ..... */ +/* JOBT (input) CHARACTER*1 */ +/* Specifies whether to return the upper triangular factor */ +/* from the QR factorization. */ +/* 'R' :: The matrix R of the QR factorization of the data */ +/* snapshot matrix F is returned in the array Y. */ +/* See the description of Y and Further details. */ +/* 'N' :: The matrix R is not returned. */ +/* ..... */ +/* JOBF (input) CHARACTER*1 */ +/* Specifies whether to store information needed for post- */ +/* processing (e.g. computing refined Ritz vectors) */ +/* 'R' :: The matrix needed for the refinement of the Ritz */ +/* vectors is computed and stored in the array B. */ +/* See the description of B. */ +/* 'E' :: The unscaled eigenvectors of the Exact DMD are */ +/* computed and returned in the array B. See the */ +/* description of B. */ +/* 'N' :: No eigenvector refinement data is computed. */ +/* To be useful on exit, this option needs JOBQ='Q'. */ +/* ..... */ +/* WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */ +/* Allows for a selection of the SVD algorithm from the */ +/* LAPACK library. */ +/* 1 :: ZGESVD (the QR SVD algorithm) */ +/* 2 :: ZGESDD (the Divide and Conquer algorithm; if enough */ +/* workspace available, this is the fastest option) */ +/* 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 */ +/* are the most accurate options) */ +/* 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */ +/* are the most accurate options) */ +/* For the four methods above, a significant difference in */ +/* the accuracy of small singular values is possible if */ +/* the snapshots vary in norm so that X is severely */ +/* ill-conditioned. If small (smaller than EPS*||X||) */ +/* singular values are of interest and JOBS=='N', then */ +/* the options (3, 4) give the most accurate results, where */ +/* the option 4 is slightly better and with stronger */ +/* theoretical background. */ +/* If JOBS=='S', i.e. the columns of X will be normalized, */ +/* then all methods give nearly equally accurate results. */ +/* ..... */ +/* M (input) INTEGER, M >= 0 */ +/* The state space dimension (the number of rows of F). */ +/* ..... */ +/* N (input) INTEGER, 0 <= N <= M */ +/* The number of data snapshots from a single trajectory, */ +/* taken at equidistant discrete times. This is the */ +/* number of columns of F. */ +/* ..... */ +/* F (input/output) COMPLEX(KIND=WP) M-by-N array */ +/* > On entry, */ +/* the columns of F are the sequence of data snapshots */ +/* from a single trajectory, taken at equidistant discrete */ +/* times. It is assumed that the column norms of F are */ +/* in the range of the normalized floating point numbers. */ +/* < On exit, */ +/* If JOBQ == 'Q', the array F contains the orthogonal */ +/* matrix/factor of the QR factorization of the initial */ +/* data snapshots matrix F. See the description of JOBQ. */ +/* If JOBQ == 'N', the entries in F strictly below the main */ +/* diagonal contain, column-wise, the information on the */ +/* Householder vectors, as returned by ZGEQRF. The */ +/* remaining information to restore the orthogonal matrix */ +/* of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */ +/* See the description of ZWORK. */ +/* ..... */ +/* LDF (input) INTEGER, LDF >= M */ +/* The leading dimension of the array F. */ +/* ..... */ +/* X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */ +/* X is used as workspace to hold representations of the */ +/* leading N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, the leading K columns of X contain the leading */ +/* K left singular vectors of the above described content */ +/* of X. To lift them to the space of the left singular */ +/* vectors U(:,1:K) of the input data, pre-multiply with the */ +/* Q factor from the initial QR factorization. */ +/* See the descriptions of F, K, V and Z. */ +/* ..... */ +/* LDX (input) INTEGER, LDX >= N */ +/* The leading dimension of the array X. */ +/* ..... */ +/* Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */ +/* Y is used as workspace to hold representations of the */ +/* trailing N-1 snapshots in the orthonormal basis computed */ +/* in the QR factorization of F. */ +/* On exit, */ +/* If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */ +/* triangular factor from the QR factorization of the data */ +/* snapshot matrix F. */ +/* ..... */ +/* LDY (input) INTEGER , LDY >= N */ +/* The leading dimension of the array Y. */ +/* ..... */ +/* NRNK (input) INTEGER */ +/* Determines the mode how to compute the numerical rank, */ +/* i.e. how to truncate small singular values of the input */ +/* matrix X. On input, if */ +/* NRNK = -1 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(1) */ +/* This option is recommended. */ +/* NRNK = -2 :: i-th singular value sigma(i) is truncated */ +/* if sigma(i) <= TOL*sigma(i-1) */ +/* This option is included for R&D purposes. */ +/* It requires highly accurate SVD, which */ +/* may not be feasible. */ +/* The numerical rank can be enforced by using positive */ +/* value of NRNK as follows: */ +/* 0 < NRNK <= N-1 :: at most NRNK largest singular values */ +/* will be used. If the number of the computed nonzero */ +/* singular values is less than NRNK, then only those */ +/* nonzero values will be used and the actually used */ +/* dimension is less than NRNK. The actual number of */ +/* the nonzero singular values is returned in the variable */ +/* K. See the description of K. */ +/* ..... */ +/* TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */ +/* The tolerance for truncating small singular values. */ +/* See the description of NRNK. */ +/* ..... */ +/* K (output) INTEGER, 0 <= K <= N */ +/* The dimension of the SVD/POD basis for the leading N-1 */ +/* data snapshots (columns of F) and the number of the */ +/* computed Ritz pairs. The value of K is determined */ +/* according to the rule set by the parameters NRNK and */ +/* TOL. See the descriptions of NRNK and TOL. */ +/* ..... */ +/* EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */ +/* The leading K (K<=N-1) entries of EIGS contain */ +/* the computed eigenvalues (Ritz values). */ +/* See the descriptions of K, and Z. */ +/* ..... */ +/* Z (workspace/output) COMPLEX(KIND=WP) M-by-(N-1) array */ +/* If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */ +/* is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */ +/* If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */ +/* Z*V, where Z contains orthonormal matrix (the product of */ +/* Q from the initial QR factorization and the SVD/POD_basis */ +/* returned by ZGEDMD in X) and the second factor (the */ +/* eigenvectors of the Rayleigh quotient) is in the array V, */ +/* as returned by ZGEDMD. That is, X(:,1:K)*V(:,i) */ +/* is an eigenvector corresponding to EIGS(i). The columns */ +/* of V(1:K,1:K) are the computed eigenvectors of the */ +/* K-by-K Rayleigh quotient. */ +/* See the descriptions of EIGS, X and V. */ +/* ..... */ +/* LDZ (input) INTEGER , LDZ >= M */ +/* The leading dimension of the array Z. */ +/* ..... */ +/* RES (output) REAL(KIND=WP) (N-1)-by-1 array */ +/* RES(1:K) contains the residuals for the K computed */ +/* Ritz pairs, */ +/* RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */ +/* See the description of EIGS and Z. */ +/* ..... */ +/* B (output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array. */ +/* IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */ +/* be used for computing the refined vectors; see further */ +/* details in the provided references. */ +/* If JOBF == 'E', B(1:N,1;K) contains */ +/* A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */ +/* Exact DMD, up to scaling by the inverse eigenvalues. */ +/* In both cases, the content of B can be lifted to the */ +/* original dimension of the input data by pre-multiplying */ +/* with the Q factor from the initial QR factorization. */ +/* Here A denotes a compression of the underlying operator. */ +/* See the descriptions of F and X. */ +/* If JOBF =='N', then B is not referenced. */ +/* ..... */ +/* LDB (input) INTEGER, LDB >= MIN(M,N) */ +/* The leading dimension of the array B. */ +/* ..... */ +/* V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* On exit, V(1:K,1:K) V contains the K eigenvectors of */ +/* the Rayleigh quotient. The Ritz vectors */ +/* (returned in Z) are the product of Q from the initial QR */ +/* factorization (see the description of F) X (see the */ +/* description of X) and V. */ +/* ..... */ +/* LDV (input) INTEGER, LDV >= N-1 */ +/* The leading dimension of the array V. */ +/* ..... */ +/* S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */ +/* The array S(1:K,1:K) is used for the matrix Rayleigh */ +/* quotient. This content is overwritten during */ +/* the eigenvalue decomposition by ZGEEV. */ +/* See the description of K. */ +/* ..... */ +/* LDS (input) INTEGER, LDS >= N-1 */ +/* The leading dimension of the array S. */ +/* ..... */ +/* ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* ZWORK(1:MIN(M,N)) contains the scalar factors of the */ +/* elementary reflectors as returned by ZGEQRF of the */ +/* M-by-N input matrix F. */ +/* If the call to ZGEDMDQ is only workspace query, then */ +/* ZWORK(1) contains the minimal complex workspace length and */ +/* ZWORK(2) is the optimal complex workspace length. */ +/* Hence, the length of work is at least 2. */ +/* See the description of LZWORK. */ +/* ..... */ +/* LZWORK (input) INTEGER */ +/* The minimal length of the workspace vector ZWORK. */ +/* LZWORK is calculated as follows: */ +/* Let MLWQR = N (minimal workspace for ZGEQRF[M,N]) */ +/* MLWDMD = minimal workspace for ZGEDMD (see the */ +/* description of LWORK in ZGEDMD) */ +/* MLWMQR = N (minimal workspace for */ +/* ZUNMQR['L','N',M,N,N]) */ +/* MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */ +/* MINMN = MIN(M,N) */ +/* Then */ +/* LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */ +/* is further updated as follows: */ +/* if JOBZ == 'V' or JOBZ == 'F' THEN */ +/* LZWORK = MAX(LZWORK, MINMN+MLWMQR) */ +/* if JOBQ == 'Q' THEN */ +/* LZWORK = MAX(ZLWORK, MINMN+MLWGQR) */ + +/* ..... */ +/* WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */ +/* On exit, */ +/* WORK(1:N-1) contains the singular values of */ +/* the input submatrix F(1:M,1:N-1). */ +/* If the call to ZGEDMDQ is only workspace query, then */ +/* WORK(1) contains the minimal workspace length and */ +/* WORK(2) is the optimal workspace length. hence, the */ +/* length of work is at least 2. */ +/* See the description of LWORK. */ +/* ..... */ +/* LWORK (input) INTEGER */ +/* The minimal length of the workspace vector WORK. */ +/* LWORK is the same as in ZGEDMD, because in ZGEDMDQ */ +/* only ZGEDMD requires real workspace for snapshots */ +/* of dimensions MIN(M,N)-by-(N-1). */ +/* If on entry LWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace length for WORK. */ +/* ..... */ +/* IWORK (workspace/output) INTEGER LIWORK-by-1 array */ +/* Workspace that is required only if WHTSVD equals */ +/* 2 , 3 or 4. (See the description of WHTSVD). */ +/* If on entry LWORK =-1 or LIWORK=-1, then the */ +/* minimal length of IWORK is computed and returned in */ +/* IWORK(1). See the description of LIWORK. */ +/* ..... */ +/* LIWORK (input) INTEGER */ +/* The minimal length of the workspace vector IWORK. */ +/* If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */ +/* Let M1=MIN(M,N), N1=N-1. Then */ +/* If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */ +/* If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */ +/* If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */ +/* If on entry LIWORK = -1, then a workspace query is */ +/* assumed and the procedure only computes the minimal */ +/* and the optimal workspace lengths for both WORK and */ +/* IWORK. See the descriptions of WORK and IWORK. */ +/* ..... */ +/* INFO (output) INTEGER */ +/* -i < 0 :: On entry, the i-th argument had an */ +/* illegal value */ +/* = 0 :: Successful return. */ +/* = 1 :: Void input. Quick exit (M=0 or N=0). */ +/* = 2 :: The SVD computation of X did not converge. */ +/* Suggestion: Check the input data and/or */ +/* repeat with different WHTSVD. */ +/* = 3 :: The computation of the eigenvalues did not */ +/* converge. */ +/* = 4 :: If data scaling was requested on input and */ +/* the procedure found inconsistency in the data */ +/* such that for some column index i, */ +/* X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */ +/* to zero if JOBS=='C'. The computation proceeds */ +/* with original or modified data and warning */ +/* flag is set with INFO=4. */ +/* ............................................................. */ +/* ............................................................. */ +/* Parameters */ +/* ~~~~~~~~~~ */ +/* COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) */ + +/* Local scalars */ +/* ~~~~~~~~~~~~~ */ + +/* External functions (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~ */ + +/* External subroutines (BLAS and LAPACK) */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* External subroutines */ +/* ~~~~~~~~~~~~~~~~~~~~ */ +/* Intrinsic functions */ +/* ~~~~~~~~~~~~~~~~~~~ */ +/* .......................................................... */ + /* Parameter adjustments */ + f_dim1 = *ldf; + f_offset = 1 + f_dim1 * 1; + f -= f_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + y_dim1 = *ldy; + y_offset = 1 + y_dim1 * 1; + y -= y_offset; + --eigs; + z_dim1 = *ldz; + z_offset = 1 + z_dim1 * 1; + z__ -= z_offset; + --res; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + v_dim1 = *ldv; + v_offset = 1 + v_dim1 * 1; + v -= v_offset; + s_dim1 = *lds; + s_offset = 1 + s_dim1 * 1; + s -= s_offset; + --zwork; + --work; + --iwork; + + /* Function Body */ + one = 1.f; + zero = 0.f; + zzero.r = 0.f, zzero.i = 0.f; + +/* Test the input arguments */ + wntres = lsame_(jobr, "R"); + sccolx = lsame_(jobs, "S") || lsame_(jobs, "C"); + sccoly = lsame_(jobs, "Y"); + wntvec = lsame_(jobz, "V"); + wntvcf = lsame_(jobz, "F"); + wntvcq = lsame_(jobz, "Q"); + wntref = lsame_(jobf, "R"); + wntex = lsame_(jobf, "E"); + wantq = lsame_(jobq, "Q"); + wnttrf = lsame_(jobt, "R"); + minmn = f2cmin(*m,*n); + *info = 0; + lquery = *lzwork == -1 || *lwork == -1 || *liwork == -1; + + if (! (sccolx || sccoly || lsame_(jobs, "N"))) { + *info = -1; + } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) { + *info = -2; + } else if (! (wntres || lsame_(jobr, "N")) || + wntres && lsame_(jobz, "N")) { + *info = -3; + } else if (! (wantq || lsame_(jobq, "N"))) { + *info = -4; + } else if (! (wnttrf || lsame_(jobt, "N"))) { + *info = -5; + } else if (! (wntref || wntex || lsame_(jobf, "N"))) + { + *info = -6; + } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == + 4)) { + *info = -7; + } else if (*m < 0) { + *info = -8; + } else if (*n < 0 || *n > *m + 1) { + *info = -9; + } else if (*ldf < *m) { + *info = -11; + } else if (*ldx < minmn) { + *info = -13; + } else if (*ldy < minmn) { + *info = -15; + } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) { + *info = -16; + } else if (*tol < zero || *tol >= one) { + *info = -17; + } else if (*ldz < *m) { + *info = -21; + } else if ((wntref || wntex) && *ldb < minmn) { + *info = -24; + } else if (*ldv < *n - 1) { + *info = -26; + } else if (*lds < *n - 1) { + *info = -28; + } + + if (wntvec || wntvcf || wntvcq) { + *(unsigned char *)jobvl = 'V'; + } else { + *(unsigned char *)jobvl = 'N'; + } + if (*info == 0) { +/* Compute the minimal and the optimal workspace */ +/* requirements. Simulate running the code and */ +/* determine minimal and optimal sizes of the */ +/* workspace at any moment of the run. */ + if (*n == 0 || *n == 1) { +/* All output except K is void. INFO=1 signals */ +/* the void input. In case of a workspace query, */ +/* the minimal workspace lengths are returned. */ + if (lquery) { + iwork[1] = 1; + zwork[1].r = 2., zwork[1].i = 0.; + zwork[2].r = 2., zwork[2].i = 0.; + work[1] = 2.; + work[2] = 2.; + } else { + *k = 0; + } + *info = 1; + return 0; + } + mlrwrk = 2; + mlwork = 2; + olwork = 2; + iminwr = 1; + mlwqr = f2cmax(1,*n); +/* Minimal workspace length for ZGEQRF. */ +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, & + info1); + olwqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwqr; + olwork = f2cmax(i__1,i__2); + } + i__1 = *n - 1; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], + ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset] + , ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[ + s_offset], lds, &zwork[1], &c_n1, &work[1], &c_n1, &iwork[1], + &c_n1, &info1); + mlwdmd = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwdmd; + mlwork = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = mlrwrk, i__2 = (integer) work[1]; + mlrwrk = f2cmax(i__1,i__2); + iminwr = f2cmax(iminwr,iwork[1]); + if (lquery) { + olwdmd = (integer) zwork[2].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwdmd; + olwork = f2cmax(i__1,i__2); + } + if (wntvec || wntvcf) { + mlwmqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwmqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], + &z__[z_offset], ldz, &zwork[1], &c_n1, &info1); + olwmqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwmqr; + olwork = f2cmax(i__1,i__2); + } + } + if (wantq) { + mlwgqr = f2cmax(1,*n); +/* Computing MAX */ + i__1 = mlwork, i__2 = minmn + mlwgqr; + mlwork = f2cmax(i__1,i__2); + if (lquery) { + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], & + zwork[1], &c_n1, &info1); + olwgqr = (integer) zwork[1].r; +/* Computing MAX */ + i__1 = olwork, i__2 = minmn + olwgqr; + olwork = f2cmax(i__1,i__2); + } + } + if (*liwork < iminwr && ! lquery) { + *info = -34; + } + if (*lwork < mlrwrk && ! lquery) { + *info = -32; + } + if (*lzwork < mlwork && ! lquery) { + *info = -30; + } + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEDMDQ", &i__1); + return 0; + } else if (lquery) { +/* Return minimal and optimal workspace sizes */ + iwork[1] = iminwr; + zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.; + zwork[2].r = (doublereal) olwork, zwork[2].i = 0.; + work[1] = (doublereal) mlrwrk; + work[2] = (doublereal) mlrwrk; + return 0; + } +/* ..... */ +/* Initial QR factorization that is used to represent the */ +/* snapshots as elements of lower dimensional subspace. */ +/* For large scale computation with M >> N, at this place */ +/* one can use an out of core QRF. */ + + i__1 = *lzwork - minmn; + zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, & + info1); + +/* Define X and Y as the snapshots representations in the */ +/* orthogonal basis computed in the QR factorization. */ +/* X corresponds to the leading N-1 and Y to the trailing */ +/* N-1 snapshots. */ + i__1 = *n - 1; + zlaset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx); + i__1 = *n - 1; + zlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy); + if (*m >= 3) { + i__1 = minmn - 2; + i__2 = *n - 2; + zlaset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy); + } + +/* Compute the DMD of the projected snapshot pairs (X,Y) */ + i__1 = *n - 1; + i__2 = *lzwork - minmn; + zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx, + &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, & + res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, & + zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, & + info1); + if (info1 == 2 || info1 == 3) { +/* Return with error code. See ZGEDMD for details. */ + *info = info1; + return 0; + } else { + *info = info1; + } + +/* The Ritz vectors (Koopman modes) can be explicitly */ +/* formed or returned in factored form. */ + if (wntvec) { +/* Compute the eigenvectors explicitly. */ + if (*m > minmn) { + i__1 = *m - minmn; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], + ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } else if (wntvcf) { +/* Return the Ritz vectors (eigenvectors) in factored */ +/* form Z*V, where Z contains orthonormal matrix (the */ +/* product of Q from the initial QR factorization and */ +/* the SVD/POD_basis returned by ZGEDMD in X) and the */ +/* second factor (the eigenvectors of the Rayleigh */ +/* quotient) is in the array V, as returned by ZGEDMD. */ + zlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz); + if (*m > *n) { + i__1 = *m - *n; + zlaset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz); + } + i__1 = *lzwork - minmn; + zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[ + z_offset], ldz, &zwork[minmn + 1], &i__1, &info1); + } + +/* Some optional output variables: */ + +/* The upper triangular factor R in the initial QR */ +/* factorization is optionally returned in the array Y. */ +/* This is useful if this call to ZGEDMDQ is to be */ +/* followed by a streaming DMD that is implemented in a */ +/* QR compressed form. */ + if (wnttrf) { +/* Return the upper triangular R in Y */ + zlaset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy); + zlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy); + } + +/* The orthonormal/unitary factor Q in the initial QR */ +/* factorization is optionally returned in the array F. */ +/* Same as with the triangular factor above, this is */ +/* useful in a streaming DMD. */ + if (wantq) { +/* Q overwrites F */ + i__1 = *lzwork - minmn; + zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn + + 1], &i__1, &info1); + } + + return 0; + +} /* zgedmdq_ */ + From fa6d06359a28ddba5996d9e25120d9907d83a1dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 Jun 2023 17:17:31 +0200 Subject: [PATCH 148/718] correct list placement of zgedmd/zgedmdq --- cmake/lapack.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 12127531d..d339f0ce9 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -915,7 +915,8 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c + zgedmd.c zgedmdq.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c @@ -925,7 +926,7 @@ set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c - zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c zgedmd.c zgedmdq.c) + zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) if(USE_XBLAS) From 42fd3f4ec7d091519eafa60e4e08849aa19af98d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 Jun 2023 22:52:31 +0200 Subject: [PATCH 149/718] Add standard module path for Windows flang --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 16b9da4f5..65ef538e9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -115,7 +115,7 @@ jobs: mkdir build cd build call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. cmake --build . --config Release ctest From 9d3154701ca0391d1e3dedd7b79739bef7dbe65d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:09:14 +0200 Subject: [PATCH 150/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f index f9384b416..1b8e53cc2 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f index 1575bd95c..e0a621b2e 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f index 67ebae335..f3b66a9e3 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f index bef27afa7..dda42faf9 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> From f12b1c7b08c3f6aa00a0133bcf056be91adc7bfd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:10:27 +0200 Subject: [PATCH 151/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f | 4 ++-- lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f index c810a1533..b9dffa4ec 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f index e49200ea5..e68a559ba 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f index 65895502b..7401f8844 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f index 449c7ac95..80b7c7f43 100644 --- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f +++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f @@ -79,8 +79,8 @@ C> \verbatim C> INFO is INTEGER C> = 0: successful exit C> < 0: if INFO = -i, the i-th argument had an illegal value -C> > 0: if INFO = i, the leading minor of order i is not -C> positive definite, and the factorization could not be +C> > 0: if INFO = i, the leading principal minor of order i +C> is not positive, and the factorization could not be C> completed. C> \endverbatim C> From 219a73a394d8bf4cd31a7cd1c534f83f205c5c70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:18:58 +0200 Subject: [PATCH 152/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/zcposv.f | 8 ++++---- lapack-netlib/SRC/zhegv.f | 2 +- lapack-netlib/SRC/zhegv_2stage.f | 2 +- lapack-netlib/SRC/zhegvd.f | 2 +- lapack-netlib/SRC/zhegvx.f | 2 +- lapack-netlib/SRC/zhpgv.f | 2 +- lapack-netlib/SRC/zhpgvd.f | 2 +- lapack-netlib/SRC/zhpgvx.f | 2 +- lapack-netlib/SRC/zla_porpvgrw.f | 6 +++--- lapack-netlib/SRC/zpbsv.f | 6 +++--- lapack-netlib/SRC/zpbsvx.f | 10 +++++----- lapack-netlib/SRC/zpbtf2.f | 4 ++-- lapack-netlib/SRC/zpbtrf.f | 4 ++-- lapack-netlib/SRC/zpftrf.f | 4 ++-- lapack-netlib/SRC/zposv.f | 6 +++--- lapack-netlib/SRC/zposvx.f | 10 +++++----- lapack-netlib/SRC/zposvxx.f | 2 +- lapack-netlib/SRC/zpotf2.f | 4 ++-- lapack-netlib/SRC/zpotrf.f | 4 ++-- lapack-netlib/SRC/zpotrf2.f | 4 ++-- lapack-netlib/SRC/zppsv.f | 6 +++--- lapack-netlib/SRC/zppsvx.f | 10 +++++----- lapack-netlib/SRC/zpptrf.f | 4 ++-- lapack-netlib/SRC/zpteqr.f | 4 ++-- lapack-netlib/SRC/zptsv.f | 4 ++-- lapack-netlib/SRC/zptsvx.f | 10 +++++----- lapack-netlib/SRC/zpttrf.f | 4 ++-- 27 files changed, 64 insertions(+), 64 deletions(-) diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f index 9b1940d60..649d2c049 100644 --- a/lapack-netlib/SRC/zcposv.f +++ b/lapack-netlib/SRC/zcposv.f @@ -187,10 +187,10 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of -*> (COMPLEX*16) A is not positive definite, so the -*> factorization could not be completed, and the solution -*> has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of (COMPLEX*16) A is not positive, so the factorization +*> could not be completed, and the solution has not been +*> computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zhegv.f b/lapack-netlib/SRC/zhegv.f index 41657e3be..c973bd0fc 100644 --- a/lapack-netlib/SRC/zhegv.f +++ b/lapack-netlib/SRC/zhegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegv_2stage.f b/lapack-netlib/SRC/zhegv_2stage.f index fda651e5e..91ac09311 100644 --- a/lapack-netlib/SRC/zhegv_2stage.f +++ b/lapack-netlib/SRC/zhegv_2stage.f @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index eeda656ad..2c3586517 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -212,7 +212,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhegvx.f b/lapack-netlib/SRC/zhegvx.f index ac9f9ef1a..71ed1c4ca 100644 --- a/lapack-netlib/SRC/zhegvx.f +++ b/lapack-netlib/SRC/zhegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgv.f b/lapack-netlib/SRC/zhpgv.f index 72876b6e4..b92168555 100644 --- a/lapack-netlib/SRC/zhpgv.f +++ b/lapack-netlib/SRC/zhpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index e96e39738..e9688f0c7 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -205,7 +205,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zhpgvx.f b/lapack-netlib/SRC/zhpgvx.f index 94d7f7733..de75b486b 100644 --- a/lapack-netlib/SRC/zhpgvx.f +++ b/lapack-netlib/SRC/zhpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f index 9b381a072..897589aa0 100644 --- a/lapack-netlib/SRC/zla_porpvgrw.f +++ b/lapack-netlib/SRC/zla_porpvgrw.f @@ -142,9 +142,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/zpbsv.f b/lapack-netlib/SRC/zpbsv.f index fe6baf8b5..ef212bbc3 100644 --- a/lapack-netlib/SRC/zpbsv.f +++ b/lapack-netlib/SRC/zpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zpbsvx.f b/lapack-netlib/SRC/zpbsvx.f index 1efd2fd1e..724102376 100644 --- a/lapack-netlib/SRC/zpbsvx.f +++ b/lapack-netlib/SRC/zpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpbtf2.f b/lapack-netlib/SRC/zpbtf2.f index 7b90af47e..39bf6c3fa 100644 --- a/lapack-netlib/SRC/zpbtf2.f +++ b/lapack-netlib/SRC/zpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpbtrf.f b/lapack-netlib/SRC/zpbtrf.f index 61cd1d434..80afb0836 100644 --- a/lapack-netlib/SRC/zpbtrf.f +++ b/lapack-netlib/SRC/zpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpftrf.f b/lapack-netlib/SRC/zpftrf.f index 6d18f2cb3..4d4a5116e 100644 --- a/lapack-netlib/SRC/zpftrf.f +++ b/lapack-netlib/SRC/zpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/zposv.f b/lapack-netlib/SRC/zposv.f index 3bb625876..0e91cde2f 100644 --- a/lapack-netlib/SRC/zposv.f +++ b/lapack-netlib/SRC/zposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zposvx.f b/lapack-netlib/SRC/zposvx.f index f9e9b1d5f..6c06dbd57 100644 --- a/lapack-netlib/SRC/zposvx.f +++ b/lapack-netlib/SRC/zposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f index 6735fab71..3886c66f0 100644 --- a/lapack-netlib/SRC/zposvxx.f +++ b/lapack-netlib/SRC/zposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/zpotf2.f b/lapack-netlib/SRC/zpotf2.f index eb88d617c..b48f3d654 100644 --- a/lapack-netlib/SRC/zpotf2.f +++ b/lapack-netlib/SRC/zpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf.f b/lapack-netlib/SRC/zpotrf.f index 3b7018276..3edae84fb 100644 --- a/lapack-netlib/SRC/zpotrf.f +++ b/lapack-netlib/SRC/zpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f index 859ddc75f..67430f231 100644 --- a/lapack-netlib/SRC/zpotrf2.f +++ b/lapack-netlib/SRC/zpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zppsv.f b/lapack-netlib/SRC/zppsv.f index 19536e204..f466266e4 100644 --- a/lapack-netlib/SRC/zppsv.f +++ b/lapack-netlib/SRC/zppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zppsvx.f b/lapack-netlib/SRC/zppsvx.f index f94badf78..60d07cbc7 100644 --- a/lapack-netlib/SRC/zppsvx.f +++ b/lapack-netlib/SRC/zppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpptrf.f b/lapack-netlib/SRC/zpptrf.f index a34d63913..a103e5eca 100644 --- a/lapack-netlib/SRC/zpptrf.f +++ b/lapack-netlib/SRC/zpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/zpteqr.f b/lapack-netlib/SRC/zpteqr.f index a81a6ad94..897136c76 100644 --- a/lapack-netlib/SRC/zpteqr.f +++ b/lapack-netlib/SRC/zpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/zptsv.f b/lapack-netlib/SRC/zptsv.f index d53e17a34..191adaff6 100644 --- a/lapack-netlib/SRC/zptsv.f +++ b/lapack-netlib/SRC/zptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/zptsvx.f b/lapack-netlib/SRC/zptsvx.f index 4a40768f1..94409a7a1 100644 --- a/lapack-netlib/SRC/zptsvx.f +++ b/lapack-netlib/SRC/zptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/zpttrf.f b/lapack-netlib/SRC/zpttrf.f index a106ec419..75ef847c4 100644 --- a/lapack-netlib/SRC/zpttrf.f +++ b/lapack-netlib/SRC/zpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim From d9a6cacef737593c5f6e7c91396da7e3769afede Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:20:49 +0200 Subject: [PATCH 153/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/sla_porpvgrw.f | 6 +++--- lapack-netlib/SRC/spbsv.f | 6 +++--- lapack-netlib/SRC/spbsvx.f | 10 +++++----- lapack-netlib/SRC/spbtf2.f | 4 ++-- lapack-netlib/SRC/spbtrf.f | 4 ++-- lapack-netlib/SRC/spftrf.f | 4 ++-- lapack-netlib/SRC/sposv.f | 6 +++--- lapack-netlib/SRC/sposvx.f | 10 +++++----- lapack-netlib/SRC/sposvxx.f | 2 +- lapack-netlib/SRC/spotf2.f | 4 ++-- lapack-netlib/SRC/spotrf.f | 4 ++-- lapack-netlib/SRC/spotrf2.f | 4 ++-- lapack-netlib/SRC/sppsv.f | 6 +++--- lapack-netlib/SRC/sppsvx.f | 10 +++++----- lapack-netlib/SRC/spptrf.f | 4 ++-- lapack-netlib/SRC/spteqr.f | 4 ++-- lapack-netlib/SRC/sptsv.f | 4 ++-- lapack-netlib/SRC/sptsvx.f | 10 +++++----- lapack-netlib/SRC/spttrf.f | 4 ++-- lapack-netlib/SRC/ssbgvx.f | 2 +- lapack-netlib/SRC/sspgv.f | 2 +- lapack-netlib/SRC/sspgvd.f | 2 +- lapack-netlib/SRC/sspgvx.f | 2 +- lapack-netlib/SRC/ssygv.f | 2 +- lapack-netlib/SRC/ssygv_2stage.f | 2 +- lapack-netlib/SRC/ssygvd.f | 2 +- lapack-netlib/SRC/ssygvx.f | 2 +- 27 files changed, 61 insertions(+), 61 deletions(-) diff --git a/lapack-netlib/SRC/sla_porpvgrw.f b/lapack-netlib/SRC/sla_porpvgrw.f index a97e7f72a..8064bf7fa 100644 --- a/lapack-netlib/SRC/sla_porpvgrw.f +++ b/lapack-netlib/SRC/sla_porpvgrw.f @@ -132,9 +132,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/spbsv.f b/lapack-netlib/SRC/spbsv.f index 2d084424e..8929321ea 100644 --- a/lapack-netlib/SRC/spbsv.f +++ b/lapack-netlib/SRC/spbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/spbsvx.f b/lapack-netlib/SRC/spbsvx.f index 27907eb85..6abb6caae 100644 --- a/lapack-netlib/SRC/spbsvx.f +++ b/lapack-netlib/SRC/spbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spbtf2.f b/lapack-netlib/SRC/spbtf2.f index ff08bc327..b5aac1ccd 100644 --- a/lapack-netlib/SRC/spbtf2.f +++ b/lapack-netlib/SRC/spbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spbtrf.f b/lapack-netlib/SRC/spbtrf.f index ef5dcbb98..d3ae216e5 100644 --- a/lapack-netlib/SRC/spbtrf.f +++ b/lapack-netlib/SRC/spbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spftrf.f b/lapack-netlib/SRC/spftrf.f index 9e2c11eea..0ed0e3abd 100644 --- a/lapack-netlib/SRC/spftrf.f +++ b/lapack-netlib/SRC/spftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sposv.f b/lapack-netlib/SRC/sposv.f index fe0a35a56..336332d2d 100644 --- a/lapack-netlib/SRC/sposv.f +++ b/lapack-netlib/SRC/sposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sposvx.f b/lapack-netlib/SRC/sposvx.f index bcf38c7ea..0770897d2 100644 --- a/lapack-netlib/SRC/sposvx.f +++ b/lapack-netlib/SRC/sposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f index 08adf1d58..19e599f64 100644 --- a/lapack-netlib/SRC/sposvxx.f +++ b/lapack-netlib/SRC/sposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/spotf2.f b/lapack-netlib/SRC/spotf2.f index 5b3504834..773b768b2 100644 --- a/lapack-netlib/SRC/spotf2.f +++ b/lapack-netlib/SRC/spotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf.f b/lapack-netlib/SRC/spotrf.f index 5d5771c86..12ef58a40 100644 --- a/lapack-netlib/SRC/spotrf.f +++ b/lapack-netlib/SRC/spotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spotrf2.f b/lapack-netlib/SRC/spotrf2.f index ae0484ce1..ef731ffa2 100644 --- a/lapack-netlib/SRC/spotrf2.f +++ b/lapack-netlib/SRC/spotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/sppsv.f b/lapack-netlib/SRC/sppsv.f index 2d3fb3d91..1f48dd4db 100644 --- a/lapack-netlib/SRC/sppsv.f +++ b/lapack-netlib/SRC/sppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/sppsvx.f b/lapack-netlib/SRC/sppsvx.f index 7d71efcd5..bd2da20ee 100644 --- a/lapack-netlib/SRC/sppsvx.f +++ b/lapack-netlib/SRC/sppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spptrf.f b/lapack-netlib/SRC/spptrf.f index 0f90399cc..be51c4824 100644 --- a/lapack-netlib/SRC/spptrf.f +++ b/lapack-netlib/SRC/spptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/spteqr.f b/lapack-netlib/SRC/spteqr.f index 0d6bf911b..6e6e9aa45 100644 --- a/lapack-netlib/SRC/spteqr.f +++ b/lapack-netlib/SRC/spteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/sptsv.f b/lapack-netlib/SRC/sptsv.f index 46aadf4c3..f11c22594 100644 --- a/lapack-netlib/SRC/sptsv.f +++ b/lapack-netlib/SRC/sptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/sptsvx.f b/lapack-netlib/SRC/sptsvx.f index 8dd04b054..eaa691cef 100644 --- a/lapack-netlib/SRC/sptsvx.f +++ b/lapack-netlib/SRC/sptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/spttrf.f b/lapack-netlib/SRC/spttrf.f index 2217a7338..e083b7456 100644 --- a/lapack-netlib/SRC/spttrf.f +++ b/lapack-netlib/SRC/spttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f index 271f35964..3607fae82 100644 --- a/lapack-netlib/SRC/ssbgvx.f +++ b/lapack-netlib/SRC/ssbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: SPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgv.f b/lapack-netlib/SRC/sspgv.f index c73e94e60..e8bc66e5d 100644 --- a/lapack-netlib/SRC/sspgv.f +++ b/lapack-netlib/SRC/sspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 73862ed1b..8ce2311fa 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -184,7 +184,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/sspgvx.f b/lapack-netlib/SRC/sspgvx.f index de581543a..6d5b4ed3d 100644 --- a/lapack-netlib/SRC/sspgvx.f +++ b/lapack-netlib/SRC/sspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f index 270957fce..f39947d92 100644 --- a/lapack-netlib/SRC/ssygv.f +++ b/lapack-netlib/SRC/ssygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f index 49f357d90..3d9a44b5e 100644 --- a/lapack-netlib/SRC/ssygv_2stage.f +++ b/lapack-netlib/SRC/ssygv_2stage.f @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 7c7e0de01..79f12a6f9 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -190,7 +190,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f index e93da60cc..344075c9f 100644 --- a/lapack-netlib/SRC/ssygvx.f +++ b/lapack-netlib/SRC/ssygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim From a20f533b868a23c96374d7c6f897cf1c76772e8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:23:26 +0200 Subject: [PATCH 154/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/dla_porpvgrw.f | 6 +++--- lapack-netlib/SRC/dpbsv.f | 6 +++--- lapack-netlib/SRC/dpbsvx.f | 10 +++++----- lapack-netlib/SRC/dpbtf2.f | 4 ++-- lapack-netlib/SRC/dpbtrf.f | 4 ++-- lapack-netlib/SRC/dpftrf.f | 4 ++-- lapack-netlib/SRC/dposv.f | 6 +++--- lapack-netlib/SRC/dposvx.f | 10 +++++----- lapack-netlib/SRC/dposvxx.f | 2 +- lapack-netlib/SRC/dpotf2.f | 4 ++-- lapack-netlib/SRC/dpotrf.f | 4 ++-- lapack-netlib/SRC/dpotrf2.f | 4 ++-- lapack-netlib/SRC/dppsv.f | 6 +++--- lapack-netlib/SRC/dppsvx.f | 10 +++++----- lapack-netlib/SRC/dpptrf.f | 4 ++-- lapack-netlib/SRC/dpteqr.f | 4 ++-- lapack-netlib/SRC/dptsv.f | 4 ++-- lapack-netlib/SRC/dptsvx.f | 10 +++++----- lapack-netlib/SRC/dpttrf.f | 4 ++-- lapack-netlib/SRC/dsbgvx.f | 2 +- lapack-netlib/SRC/dspgv.f | 2 +- lapack-netlib/SRC/dspgvd.f | 2 +- lapack-netlib/SRC/dspgvx.f | 2 +- lapack-netlib/SRC/dsposv.f | 4 ++-- lapack-netlib/SRC/dsygv.f | 2 +- lapack-netlib/SRC/dsygv_2stage.f | 2 +- lapack-netlib/SRC/dsygvd.f | 2 +- lapack-netlib/SRC/dsygvx.f | 2 +- 28 files changed, 63 insertions(+), 63 deletions(-) diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f index 93ad3eb6a..00fdd7ae1 100644 --- a/lapack-netlib/SRC/dla_porpvgrw.f +++ b/lapack-netlib/SRC/dla_porpvgrw.f @@ -134,9 +134,9 @@ * UPPER = LSAME( 'Upper', UPLO ) * -* DPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* DPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0D+0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/dpbsv.f b/lapack-netlib/SRC/dpbsv.f index a52e78309..2d8f06440 100644 --- a/lapack-netlib/SRC/dpbsv.f +++ b/lapack-netlib/SRC/dpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dpbsvx.f b/lapack-netlib/SRC/dpbsvx.f index 1bf526fc7..142dda5da 100644 --- a/lapack-netlib/SRC/dpbsvx.f +++ b/lapack-netlib/SRC/dpbsvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -281,10 +281,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpbtf2.f b/lapack-netlib/SRC/dpbtf2.f index 534629294..1284c9ec8 100644 --- a/lapack-netlib/SRC/dpbtf2.f +++ b/lapack-netlib/SRC/dpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpbtrf.f b/lapack-netlib/SRC/dpbtrf.f index 8256f8938..29e9aaecd 100644 --- a/lapack-netlib/SRC/dpbtrf.f +++ b/lapack-netlib/SRC/dpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpftrf.f b/lapack-netlib/SRC/dpftrf.f index 980debaf2..312dcdf84 100644 --- a/lapack-netlib/SRC/dpftrf.f +++ b/lapack-netlib/SRC/dpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dposv.f b/lapack-netlib/SRC/dposv.f index ee2988e6f..cb76e9977 100644 --- a/lapack-netlib/SRC/dposv.f +++ b/lapack-netlib/SRC/dposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dposvx.f b/lapack-netlib/SRC/dposvx.f index 4a0b9d605..faffff803 100644 --- a/lapack-netlib/SRC/dposvx.f +++ b/lapack-netlib/SRC/dposvx.f @@ -71,7 +71,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -277,10 +277,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f index e74b23c80..b5336cf63 100644 --- a/lapack-netlib/SRC/dposvxx.f +++ b/lapack-netlib/SRC/dposvxx.f @@ -88,7 +88,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/dpotf2.f b/lapack-netlib/SRC/dpotf2.f index 08fa4957f..30da5c3f3 100644 --- a/lapack-netlib/SRC/dpotf2.f +++ b/lapack-netlib/SRC/dpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf.f b/lapack-netlib/SRC/dpotrf.f index 1679fc3cd..65509feb8 100644 --- a/lapack-netlib/SRC/dpotrf.f +++ b/lapack-netlib/SRC/dpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpotrf2.f b/lapack-netlib/SRC/dpotrf2.f index 6c28ce6d6..aaf9b9c58 100644 --- a/lapack-netlib/SRC/dpotrf2.f +++ b/lapack-netlib/SRC/dpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dppsv.f b/lapack-netlib/SRC/dppsv.f index 435703b08..1888005d9 100644 --- a/lapack-netlib/SRC/dppsv.f +++ b/lapack-netlib/SRC/dppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dppsvx.f b/lapack-netlib/SRC/dppsvx.f index cb41d39ee..3b08fc821 100644 --- a/lapack-netlib/SRC/dppsvx.f +++ b/lapack-netlib/SRC/dppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpptrf.f b/lapack-netlib/SRC/dpptrf.f index 2d8de5110..d9eed910d 100644 --- a/lapack-netlib/SRC/dpptrf.f +++ b/lapack-netlib/SRC/dpptrf.f @@ -79,8 +79,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/dpteqr.f b/lapack-netlib/SRC/dpteqr.f index aa1f1a80c..d07b065b0 100644 --- a/lapack-netlib/SRC/dpteqr.f +++ b/lapack-netlib/SRC/dpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/dptsv.f b/lapack-netlib/SRC/dptsv.f index addc34b88..41d8cff15 100644 --- a/lapack-netlib/SRC/dptsv.f +++ b/lapack-netlib/SRC/dptsv.f @@ -93,8 +93,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/dptsvx.f b/lapack-netlib/SRC/dptsvx.f index 7fb6cf436..fcbf5aa3b 100644 --- a/lapack-netlib/SRC/dptsvx.f +++ b/lapack-netlib/SRC/dptsvx.f @@ -59,7 +59,7 @@ *> factorization can also be regarded as having the form *> A = U**T*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -199,10 +199,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/dpttrf.f b/lapack-netlib/SRC/dpttrf.f index e0022e3ad..156e845a3 100644 --- a/lapack-netlib/SRC/dpttrf.f +++ b/lapack-netlib/SRC/dpttrf.f @@ -70,8 +70,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f index 20de17931..53deae580 100644 --- a/lapack-netlib/SRC/dsbgvx.f +++ b/lapack-netlib/SRC/dsbgvx.f @@ -267,7 +267,7 @@ *> Their indices are stored in IFAIL. *> > N: DPBSTF returned an error code; i.e., *> if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgv.f b/lapack-netlib/SRC/dspgv.f index d8ec3b1a4..96041c301 100644 --- a/lapack-netlib/SRC/dspgv.f +++ b/lapack-netlib/SRC/dspgv.f @@ -139,7 +139,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero. *> > N: if INFO = n + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index df215ae1a..ec3cdc1ac 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -184,7 +184,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dspgvx.f b/lapack-netlib/SRC/dspgvx.f index ec93147aa..5afd73d02 100644 --- a/lapack-netlib/SRC/dspgvx.f +++ b/lapack-netlib/SRC/dspgvx.f @@ -245,7 +245,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f index c3b8de0e3..0bd75698d 100644 --- a/lapack-netlib/SRC/dsposv.f +++ b/lapack-netlib/SRC/dsposv.f @@ -177,8 +177,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of (DOUBLE -*> PRECISION) A is not positive definite, so the +*> > 0: if INFO = i, the leading principal minor of order i +*> of (DOUBLE PRECISION) A is not positive, so the *> factorization could not be completed, and the solution *> has not been computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygv.f b/lapack-netlib/SRC/dsygv.f index 5208dbb1f..02a4cc3ed 100644 --- a/lapack-netlib/SRC/dsygv.f +++ b/lapack-netlib/SRC/dsygv.f @@ -154,7 +154,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygv_2stage.f b/lapack-netlib/SRC/dsygv_2stage.f index 5c71ebf94..383304267 100644 --- a/lapack-netlib/SRC/dsygv_2stage.f +++ b/lapack-netlib/SRC/dsygv_2stage.f @@ -173,7 +173,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 3b38665a7..d6682d4e5 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -190,7 +190,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/dsygvx.f b/lapack-netlib/SRC/dsygvx.f index 3fa55b97c..2dc27e8a8 100644 --- a/lapack-netlib/SRC/dsygvx.f +++ b/lapack-netlib/SRC/dsygvx.f @@ -270,7 +270,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim From 7b73666d709c7f77379a35d318fdc34bd3818efe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jun 2023 16:25:24 +0200 Subject: [PATCH 155/718] Fix confusing use of "minor" in inline documentation (Reference-LAPACK PR849) --- lapack-netlib/SRC/chegv.f | 2 +- lapack-netlib/SRC/chegv_2stage.f | 2 +- lapack-netlib/SRC/chegvd.f | 2 +- lapack-netlib/SRC/chegvx.f | 2 +- lapack-netlib/SRC/chpgv.f | 2 +- lapack-netlib/SRC/chpgvd.f | 2 +- lapack-netlib/SRC/chpgvx.f | 2 +- lapack-netlib/SRC/cla_porpvgrw.f | 6 +++--- lapack-netlib/SRC/cpbsv.f | 6 +++--- lapack-netlib/SRC/cpbsvx.f | 10 +++++----- lapack-netlib/SRC/cpbtf2.f | 4 ++-- lapack-netlib/SRC/cpbtrf.f | 4 ++-- lapack-netlib/SRC/cpftrf.f | 4 ++-- lapack-netlib/SRC/cposv.f | 6 +++--- lapack-netlib/SRC/cposvx.f | 10 +++++----- lapack-netlib/SRC/cposvxx.f | 2 +- lapack-netlib/SRC/cpotf2.f | 4 ++-- lapack-netlib/SRC/cpotrf.f | 4 ++-- lapack-netlib/SRC/cpotrf2.f | 4 ++-- lapack-netlib/SRC/cppsv.f | 6 +++--- lapack-netlib/SRC/cppsvx.f | 10 +++++----- lapack-netlib/SRC/cpptrf.f | 6 +++--- lapack-netlib/SRC/cpteqr.f | 4 ++-- lapack-netlib/SRC/cptsv.f | 4 ++-- lapack-netlib/SRC/cptsvx.f | 10 +++++----- lapack-netlib/SRC/cpttrf.f | 4 ++-- 26 files changed, 61 insertions(+), 61 deletions(-) diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f index f7675a19f..198e5d102 100644 --- a/lapack-netlib/SRC/chegv.f +++ b/lapack-netlib/SRC/chegv.f @@ -160,7 +160,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f index 472581c4b..d2b8fc795 100644 --- a/lapack-netlib/SRC/chegv_2stage.f +++ b/lapack-netlib/SRC/chegv_2stage.f @@ -179,7 +179,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not converge to zero; *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 4b7f43d52..c96f011af 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -212,7 +212,7 @@ *> the submatrix lying in rows and columns INFO/(N+1) *> through mod(INFO,N+1); *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f index 6e428242d..8e565222d 100644 --- a/lapack-netlib/SRC/chegvx.f +++ b/lapack-netlib/SRC/chegvx.f @@ -280,7 +280,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= N, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chpgv.f b/lapack-netlib/SRC/chpgv.f index 417f10121..660724e05 100644 --- a/lapack-netlib/SRC/chpgv.f +++ b/lapack-netlib/SRC/chpgv.f @@ -144,7 +144,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 65d08b783..5c9e417d3 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -205,7 +205,7 @@ *> i off-diagonal elements of an intermediate *> tridiagonal form did not convergeto zero; *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/chpgvx.f b/lapack-netlib/SRC/chpgvx.f index 711daf55f..2646800cc 100644 --- a/lapack-netlib/SRC/chpgvx.f +++ b/lapack-netlib/SRC/chpgvx.f @@ -250,7 +250,7 @@ *> i eigenvectors failed to converge. Their indices *> are stored in array IFAIL. *> > N: if INFO = N + i, for 1 <= i <= n, then the leading -*> minor of order i of B is not positive definite. +*> principal minor of order i of B is not positive. *> The factorization of B could not be completed and *> no eigenvalues or eigenvectors were computed. *> \endverbatim diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f index 1eb706d1a..78cd19da9 100644 --- a/lapack-netlib/SRC/cla_porpvgrw.f +++ b/lapack-netlib/SRC/cla_porpvgrw.f @@ -140,9 +140,9 @@ * .. Executable Statements .. UPPER = LSAME( 'Upper', UPLO ) * -* SPOTRF will have factored only the NCOLSxNCOLS leading minor, so -* we restrict the growth search to that minor and use only the first -* 2*NCOLS workspace entries. +* SPOTRF will have factored only the NCOLSxNCOLS leading submatrix, +* so we restrict the growth search to that submatrix and use only +* the first 2*NCOLS workspace entries. * RPVGRW = 1.0 DO I = 1, 2*NCOLS diff --git a/lapack-netlib/SRC/cpbsv.f b/lapack-netlib/SRC/cpbsv.f index 248abbc1f..889bbde08 100644 --- a/lapack-netlib/SRC/cpbsv.f +++ b/lapack-netlib/SRC/cpbsv.f @@ -119,9 +119,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpbsvx.f b/lapack-netlib/SRC/cpbsvx.f index 652e18501..975c87768 100644 --- a/lapack-netlib/SRC/cpbsvx.f +++ b/lapack-netlib/SRC/cpbsvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular band matrix, and L is a lower *> triangular band matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -280,10 +280,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpbtf2.f b/lapack-netlib/SRC/cpbtf2.f index 0be2c0a7f..f5bc9b3a6 100644 --- a/lapack-netlib/SRC/cpbtf2.f +++ b/lapack-netlib/SRC/cpbtf2.f @@ -97,8 +97,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpbtrf.f b/lapack-netlib/SRC/cpbtrf.f index a4c18efb3..af60780c8 100644 --- a/lapack-netlib/SRC/cpbtrf.f +++ b/lapack-netlib/SRC/cpbtrf.f @@ -92,8 +92,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpftrf.f b/lapack-netlib/SRC/cpftrf.f index cbaab6832..12799c6f7 100644 --- a/lapack-netlib/SRC/cpftrf.f +++ b/lapack-netlib/SRC/cpftrf.f @@ -91,8 +91,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> *> Further Notes on RFP Format: diff --git a/lapack-netlib/SRC/cposv.f b/lapack-netlib/SRC/cposv.f index f37dfa3c0..ea6fc37db 100644 --- a/lapack-netlib/SRC/cposv.f +++ b/lapack-netlib/SRC/cposv.f @@ -110,9 +110,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cposvx.f b/lapack-netlib/SRC/cposvx.f index 78b9f4db1..322a26447 100644 --- a/lapack-netlib/SRC/cposvx.f +++ b/lapack-netlib/SRC/cposvx.f @@ -70,7 +70,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -276,10 +276,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f index 7834c75da..c40a2d856 100644 --- a/lapack-netlib/SRC/cposvxx.f +++ b/lapack-netlib/SRC/cposvxx.f @@ -87,7 +87,7 @@ *> where U is an upper triangular matrix and L is a lower triangular *> matrix. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A (see argument RCOND). If the reciprocal of the condition number diff --git a/lapack-netlib/SRC/cpotf2.f b/lapack-netlib/SRC/cpotf2.f index 2f4658bae..d84988949 100644 --- a/lapack-netlib/SRC/cpotf2.f +++ b/lapack-netlib/SRC/cpotf2.f @@ -89,8 +89,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf.f b/lapack-netlib/SRC/cpotrf.f index 6aba3103e..e2b120a49 100644 --- a/lapack-netlib/SRC/cpotrf.f +++ b/lapack-netlib/SRC/cpotrf.f @@ -87,8 +87,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f index e1eae3e9d..ea2e4ca98 100644 --- a/lapack-netlib/SRC/cpotrf2.f +++ b/lapack-netlib/SRC/cpotrf2.f @@ -86,8 +86,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the factorization could not be *> completed. *> \endverbatim * diff --git a/lapack-netlib/SRC/cppsv.f b/lapack-netlib/SRC/cppsv.f index 1e6f02695..a8fd660c4 100644 --- a/lapack-netlib/SRC/cppsv.f +++ b/lapack-netlib/SRC/cppsv.f @@ -104,9 +104,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i of A is not -*> positive definite, so the factorization could not be -*> completed, and the solution has not been computed. +*> > 0: if INFO = i, the leading principal minor of order i +*> of A is not positive, so the factorization could not +*> be completed, and the solution has not been computed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cppsvx.f b/lapack-netlib/SRC/cppsvx.f index f6f07538c..2ef02100f 100644 --- a/lapack-netlib/SRC/cppsvx.f +++ b/lapack-netlib/SRC/cppsvx.f @@ -69,7 +69,7 @@ *> where U is an upper triangular matrix, L is a lower triangular *> matrix, and **H indicates conjugate transpose. *> -*> 3. If the leading i-by-i principal minor is not positive definite, +*> 3. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -262,10 +262,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpptrf.f b/lapack-netlib/SRC/cpptrf.f index 4e81458cb..e36f834cb 100644 --- a/lapack-netlib/SRC/cpptrf.f +++ b/lapack-netlib/SRC/cpptrf.f @@ -79,9 +79,9 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the factorization could not be -*> completed. +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive definite, and the factorization could +*> not be completed. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/cpteqr.f b/lapack-netlib/SRC/cpteqr.f index e3af59041..fc9c44908 100644 --- a/lapack-netlib/SRC/cpteqr.f +++ b/lapack-netlib/SRC/cpteqr.f @@ -123,8 +123,8 @@ *> < 0: if INFO = -i, the i-th argument had an illegal value. *> > 0: if INFO = i, and i is: *> <= N the Cholesky factorization of the matrix could -*> not be performed because the i-th principal minor -*> was not positive definite. +*> not be performed because the leading principal +*> minor of order i was not positive. *> > N the SVD algorithm failed to converge; *> if INFO = N+i, i off-diagonal elements of the *> bidiagonal factor did not converge to zero. diff --git a/lapack-netlib/SRC/cptsv.f b/lapack-netlib/SRC/cptsv.f index 20ee32bbe..4c16f6a0a 100644 --- a/lapack-netlib/SRC/cptsv.f +++ b/lapack-netlib/SRC/cptsv.f @@ -94,8 +94,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, the leading minor of order i is not -*> positive definite, and the solution has not been +*> > 0: if INFO = i, the leading principal minor of order i +*> is not positive, and the solution has not been *> computed. The factorization has not been completed *> unless i = N. *> \endverbatim diff --git a/lapack-netlib/SRC/cptsvx.f b/lapack-netlib/SRC/cptsvx.f index db63a3c36..6f7d8cf5b 100644 --- a/lapack-netlib/SRC/cptsvx.f +++ b/lapack-netlib/SRC/cptsvx.f @@ -60,7 +60,7 @@ *> factorization can also be regarded as having the form *> A = U**H*D*U. *> -*> 2. If the leading i-by-i principal minor is not positive definite, +*> 2. If the leading principal minor of order i is not positive, *> then the routine returns with INFO = i. Otherwise, the factored *> form of A is used to estimate the condition number of the matrix *> A. If the reciprocal of the condition number is less than machine @@ -205,10 +205,10 @@ *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value *> > 0: if INFO = i, and i is -*> <= N: the leading minor of order i of A is -*> not positive definite, so the factorization -*> could not be completed, and the solution has not -*> been computed. RCOND = 0 is returned. +*> <= N: the leading principal minor of order i of A +*> is not positive, so the factorization could not +*> be completed, and the solution has not been +*> computed. RCOND = 0 is returned. *> = N+1: U is nonsingular, but RCOND is less than machine *> precision, meaning that the matrix is singular *> to working precision. Nevertheless, the diff --git a/lapack-netlib/SRC/cpttrf.f b/lapack-netlib/SRC/cpttrf.f index c6379e923..111343b78 100644 --- a/lapack-netlib/SRC/cpttrf.f +++ b/lapack-netlib/SRC/cpttrf.f @@ -71,8 +71,8 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -k, the k-th argument had an illegal value -*> > 0: if INFO = k, the leading minor of order k is not -*> positive definite; if k < N, the factorization could not +*> > 0: if INFO = k, the leading principal minor of order k +*> is not positive; if k < N, the factorization could not *> be completed, while if k = N, the factorization was *> completed, but D(N) <= 0. *> \endverbatim From d301649430be60a529d978318a1ffd62e34141f6 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Fri, 23 Jun 2023 19:42:27 -0700 Subject: [PATCH 156/718] fix #4063 threading perf issues on Windows --- driver/others/blas_server_win32.c | 99 +++++++++++++------------------ 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index afa33cccc..1d747bd9f 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -52,9 +52,7 @@ /* Thread server common information */ typedef struct{ - CRITICAL_SECTION lock; - HANDLE filled; - HANDLE killed; + HANDLE taskSemaphore; blas_queue_t *queue; /* Parameter Pointer */ int shutdown; /* server shutdown flag */ @@ -68,6 +66,7 @@ int blas_server_avail = 0; static BLASULONG server_lock = 0; static blas_pool_t pool; +static BLASULONG pool_lock = 0; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; @@ -198,7 +197,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* This is a main routine of threads. Each thread waits until job is */ /* queued. */ - static DWORD WINAPI blas_thread_server(void *arg){ /* Thread identifier */ @@ -207,9 +205,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ #endif void *buffer, *sa, *sb; - blas_queue_t *queue; - DWORD action; - HANDLE handles[] = {pool.filled, pool.killed}; + volatile blas_queue_t *queue; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); @@ -226,28 +222,32 @@ static DWORD WINAPI blas_thread_server(void *arg){ fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); #endif - do { - action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); - - if (action == WAIT_OBJECT_0 + 1) break; + // all worker threads wait on the semaphore + WaitForSingleObject(pool.taskSemaphore, INFINITE); + // kill the thread if we are shutting down the server + if (pool.shutdown) + break; + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif - EnterCriticalSection(&pool.lock); + // grab a queued task and update the list + volatile blas_queue_t* queue_next; + LONG64 prev_value; + do { + queue = (volatile blas_queue_t*)pool.queue; + if (!queue) + break; - queue = pool.queue; - if (queue) pool.queue = queue->next; - - LeaveCriticalSection(&pool.lock); + queue_next = (volatile blas_queue_t*)queue->next; + prev_value = InterlockedCompareExchange64((PLONG64)&pool.queue, (LONG64)queue_next, (LONG64)queue); + } while (prev_value != queue); if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - if (pool.queue) SetEvent(pool.filled); - sa = queue -> sa; sb = queue -> sb; @@ -332,13 +332,8 @@ static DWORD WINAPI blas_thread_server(void *arg){ fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif - EnterCriticalSection(&queue->lock); - - queue -> status = BLAS_STATUS_FINISHED; - - LeaveCriticalSection(&queue->lock); - - SetEvent(queue->finish); + // mark our sub-task as complete + InterlockedDecrement(&queue->status); } /* Shutdown procedure */ @@ -353,7 +348,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ } /* Initializing routine */ -int blas_thread_init(void){ + int blas_thread_init(void){ BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; @@ -367,9 +362,7 @@ int blas_thread_init(void){ if (!blas_server_avail){ - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL); pool.shutdown = 0; pool.queue = NULL; @@ -391,11 +384,10 @@ int blas_thread_init(void){ /* User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. + exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. + exec_blas ... returns after jobs are finished. */ - int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #if defined(SMP_SERVER) @@ -409,8 +401,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ current = queue; while (current) { - InitializeCriticalSection(¤t -> lock); - current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); + current->status = 1; current -> position = pos; #ifdef CONSISTENT_FPCSR @@ -422,19 +413,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pos ++; } - EnterCriticalSection(&pool.lock); + pool.queue = queue; - if (pool.queue) { - current = pool.queue; - while (current -> next) current = current -> next; - current -> next = queue; - } else { - pool.queue = queue; - } - - LeaveCriticalSection(&pool.lock); - - SetEvent(pool.filled); + // start up worker threads + ReleaseSemaphore(pool.taskSemaphore, pos - 1, NULL); return 0; } @@ -450,10 +432,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "Waiting Queue ..\n"); #endif - WaitForSingleObject(queue->finish, INFINITE); - - CloseHandle(queue->finish); - DeleteCriticalSection(&queue -> lock); + // spin-wait on each sub-task to finish + while (*((volatile int*)&queue->status)) + YIELDING; queue = queue -> next; num --; @@ -501,18 +482,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. */ - int BLASFUNC(blas_thread_shutdown)(void){ int i; +#ifdef SMP_DEBUG + fprintf(STDERR, "blas_thread_shutdown..\n"); +#endif + if (!blas_server_avail) return 0; LOCK_COMMAND(&server_lock); if (blas_server_avail){ - SetEvent(pool.killed); + pool.shutdown = 1; for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects @@ -528,8 +512,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ CloseHandle(blas_threads[i]); } - CloseHandle(pool.filled); - CloseHandle(pool.killed); + CloseHandle(pool.taskSemaphore); blas_server_avail = 0; } @@ -559,16 +542,14 @@ void goto_set_num_threads(int num_threads) //increased_threads = 1; if (!blas_server_avail){ - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL); pool.shutdown = 0; pool.queue = NULL; blas_server_avail = 1; } - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = blas_num_threads; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From 8caabc5982153f7be980410e95f87749bc04c70f Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Fri, 23 Jun 2023 19:45:16 -0700 Subject: [PATCH 157/718] fix #4063 remove unused pool_lock --- driver/others/blas_server_win32.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 1d747bd9f..3fdb86ec3 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -66,12 +66,9 @@ int blas_server_avail = 0; static BLASULONG server_lock = 0; static blas_pool_t pool; -static BLASULONG pool_lock = 0; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; - - static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ From 427f9f2428eb892745591394f153a012f2d35c2f Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Fri, 23 Jun 2023 22:15:39 -0700 Subject: [PATCH 158/718] update contributors --- CONTRIBUTORS.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 71df13634..7efc04092 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -216,3 +216,8 @@ In chronological order: * Pablo Romero * [2022-08] Fix building from sources for QNX + +* Mark Seminatore + * [2023-06-23] Fix bounds issue in goto_set_num_threads + * [2023-06-23] Improve Windows threading performance scaling + \ No newline at end of file From 7783a9af020aa0cbacd1e6d315c2594a17b71961 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Sat, 24 Jun 2023 14:35:11 -0700 Subject: [PATCH 159/718] attempt to fix old mingw gcc issue --- driver/others/blas_server_win32.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 3fdb86ec3..25879250f 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,6 +50,15 @@ /* This is a thread implementation for Win32 lazy implementation */ +#if defined (__GNUC__) && (__GNUC__ < 6) + #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) +#else + #if defined(_WIN64) + #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) + #else + #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) +#endif + /* Thread server common information */ typedef struct{ HANDLE taskSemaphore; @@ -232,14 +241,14 @@ static DWORD WINAPI blas_thread_server(void *arg){ // grab a queued task and update the list volatile blas_queue_t* queue_next; - LONG64 prev_value; + INT_PTR prev_value; do { queue = (volatile blas_queue_t*)pool.queue; if (!queue) break; queue_next = (volatile blas_queue_t*)queue->next; - prev_value = InterlockedCompareExchange64((PLONG64)&pool.queue, (LONG64)queue_next, (LONG64)queue); + prev_value = WIN_CAS((INT_PTR*)&pool.queue, (INT_PTR)queue_next, (INT_PTR)queue); } while (prev_value != queue); if (queue) { From d6991dd230d5ff33bd86708762de8da05ebc0636 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Sat, 24 Jun 2023 15:43:32 -0700 Subject: [PATCH 160/718] fix missing #endif --- driver/others/blas_server_win32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 25879250f..0b213bf2c 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -57,6 +57,7 @@ #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) #else #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) + #endif #endif /* Thread server common information */ From 572e482b38fcb18ae25e46183aa4c956e1c7d48b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Jun 2023 12:42:05 +0200 Subject: [PATCH 161/718] Fix inline documentation of S/DLASD0 (Reference-LAPACK PR 855) --- lapack-netlib/SRC/dlasd0.f | 10 ++++++---- lapack-netlib/SRC/slasd0.f | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/dlasd0.f b/lapack-netlib/SRC/dlasd0.f index 215dc8717..6d8d4e2db 100644 --- a/lapack-netlib/SRC/dlasd0.f +++ b/lapack-netlib/SRC/dlasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is DOUBLE PRECISION array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is DOUBLE PRECISION array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT diff --git a/lapack-netlib/SRC/slasd0.f b/lapack-netlib/SRC/slasd0.f index a45f741a9..c8a5c7838 100644 --- a/lapack-netlib/SRC/slasd0.f +++ b/lapack-netlib/SRC/slasd0.f @@ -79,10 +79,11 @@ *> On exit, E has been destroyed. *> \endverbatim *> -*> \param[out] U +*> \param[in,out] U *> \verbatim *> U is REAL array, dimension (LDU, N) -*> On exit, U contains the left singular vectors. +*> On exit, U contains the left singular vectors, +*> if U passed in as (N, N) Identity. *> \endverbatim *> *> \param[in] LDU @@ -91,10 +92,11 @@ *> On entry, leading dimension of U. *> \endverbatim *> -*> \param[out] VT +*> \param[in,out] VT *> \verbatim *> VT is REAL array, dimension (LDVT, M) -*> On exit, VT**T contains the right singular vectors. +*> On exit, VT**T contains the right singular vectors, +*> if VT passed in as (M, M) Identity. *> \endverbatim *> *> \param[in] LDVT From 1fe96f8da788fe58351c482625d55e058db61974 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Jun 2023 22:36:57 +0200 Subject: [PATCH 162/718] Fix failures to handle increments of zero --- kernel/riscv64/rot_vector.c | 1 + kernel/riscv64/swap_vector.c | 1 + kernel/riscv64/zrot_vector.c | 1 + kernel/riscv64/zswap_vector.c | 1 + 4 files changed, 4 insertions(+) diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index 43a65e552..f3786e1d0 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -155,6 +155,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } }else{ gvl = VSETVL(n); + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT); BLASLONG inc_xv = inc_x * gvl; diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index b16592808..82fa5ce31 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -136,6 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } }else{ gvl = VSETVL(n); + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); stride_x = inc_x * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT); if(gvl <= n/2){ diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 858dfd173..727d13a87 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -112,6 +112,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } }else{ + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); for(i=0,j=0; i < n/gvl; i++){ vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index c1dcaccab..09cc8992a 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -81,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm } }else{ gvl = VSETVL(n); + if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * gvl * 2; From b13787971e108d4a267feae478eb253ad7781c9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Jun 2023 22:37:56 +0200 Subject: [PATCH 163/718] Enforce -O1 to work around miscompilation by the vendor gcc --- Makefile.riscv64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce91e03ec..0246c0f7d 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -1,4 +1,4 @@ ifeq ($(CORE), C910V) -CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 +CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -O1 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif From 329bd3410b5ba48310b035fd75d1dd445413bc3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 15:44:10 +0200 Subject: [PATCH 164/718] Remove legacy warning comments and rename variable LAMDA to LAMBDA (Reference-LAPACK PR 852) --- lapack-netlib/SRC/cgelsd.f | 6 ---- lapack-netlib/SRC/cgesdd.f | 6 ---- lapack-netlib/SRC/chbevd.f | 6 ---- lapack-netlib/SRC/chbevd_2stage.f | 6 ---- lapack-netlib/SRC/chbgvd.f | 6 ---- lapack-netlib/SRC/cheevd.f | 6 ---- lapack-netlib/SRC/cheevd_2stage.f | 6 ---- lapack-netlib/SRC/chegvd.f | 6 ---- lapack-netlib/SRC/chpevd.f | 6 ---- lapack-netlib/SRC/chpgvd.f | 6 ---- lapack-netlib/SRC/claed8.f | 30 ++++++++--------- lapack-netlib/SRC/clals0.f | 10 ++++++ lapack-netlib/SRC/clalsd.f | 6 ---- lapack-netlib/SRC/cstedc.f | 6 ---- lapack-netlib/SRC/dbdsdc.f | 7 ---- lapack-netlib/SRC/dgelsd.f | 6 ---- lapack-netlib/SRC/dgesdd.f | 6 ---- lapack-netlib/SRC/dlaed2.f | 30 ++++++++--------- lapack-netlib/SRC/dlaed3.f | 52 +++++++----------------------- lapack-netlib/SRC/dlaed8.f | 34 ++++++++++---------- lapack-netlib/SRC/dlaed9.f | 47 ++++++++------------------- lapack-netlib/SRC/dlals0.f | 10 ++++++ lapack-netlib/SRC/dlalsd.f | 6 ---- lapack-netlib/SRC/dlas2.f | 4 +-- lapack-netlib/SRC/dlasd3.f | 34 ++------------------ lapack-netlib/SRC/dlasd8.f | 30 ++++------------- lapack-netlib/SRC/dlasv2.f | 4 +-- lapack-netlib/SRC/dsbevd.f | 6 ---- lapack-netlib/SRC/dsbevd_2stage.f | 6 ---- lapack-netlib/SRC/dsbgvd.f | 6 ---- lapack-netlib/SRC/dspevd.f | 6 ---- lapack-netlib/SRC/dspgvd.f | 6 ---- lapack-netlib/SRC/dstedc.f | 6 ---- lapack-netlib/SRC/dstevd.f | 6 ---- lapack-netlib/SRC/dsyevd.f | 7 ---- lapack-netlib/SRC/dsyevd_2stage.f | 6 ---- lapack-netlib/SRC/dsygvd.f | 6 ---- lapack-netlib/SRC/sbdsdc.f | 7 ---- lapack-netlib/SRC/sgelsd.f | 6 ---- lapack-netlib/SRC/sgesdd.f | 6 ---- lapack-netlib/SRC/slaed2.f | 30 ++++++++--------- lapack-netlib/SRC/slaed3.f | 53 +++++++------------------------ lapack-netlib/SRC/slaed8.f | 34 ++++++++++---------- lapack-netlib/SRC/slaed9.f | 47 ++++++++------------------- lapack-netlib/SRC/slals0.f | 10 ++++++ lapack-netlib/SRC/slalsd.f | 6 ---- lapack-netlib/SRC/slas2.f | 4 +-- lapack-netlib/SRC/slasd3.f | 34 ++------------------ lapack-netlib/SRC/slasd8.f | 30 ++++------------- lapack-netlib/SRC/slasv2.f | 4 +-- lapack-netlib/SRC/ssbevd.f | 6 ---- lapack-netlib/SRC/ssbevd_2stage.f | 6 ---- lapack-netlib/SRC/ssbgvd.f | 6 ---- lapack-netlib/SRC/sspevd.f | 6 ---- lapack-netlib/SRC/sspgvd.f | 6 ---- lapack-netlib/SRC/sstedc.f | 6 ---- lapack-netlib/SRC/sstevd.f | 6 ---- lapack-netlib/SRC/ssyevd.f | 7 ---- lapack-netlib/SRC/ssyevd_2stage.f | 6 ---- lapack-netlib/SRC/ssygvd.f | 6 ---- lapack-netlib/SRC/zgelsd.f | 6 ---- lapack-netlib/SRC/zgesdd.f | 6 ---- lapack-netlib/SRC/zhbevd.f | 6 ---- lapack-netlib/SRC/zhbevd_2stage.f | 6 ---- lapack-netlib/SRC/zhbgvd.f | 6 ---- lapack-netlib/SRC/zheevd.f | 6 ---- lapack-netlib/SRC/zheevd_2stage.f | 6 ---- lapack-netlib/SRC/zhegvd.f | 6 ---- lapack-netlib/SRC/zhpevd.f | 6 ---- lapack-netlib/SRC/zhpgvd.f | 6 ---- lapack-netlib/SRC/zlaed8.f | 30 ++++++++--------- lapack-netlib/SRC/zlals0.f | 10 ++++++ lapack-netlib/SRC/zlalsd.f | 6 ---- lapack-netlib/SRC/zstedc.f | 6 ---- 74 files changed, 206 insertions(+), 681 deletions(-) diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f index fce4ca6e2..c3c77bf63 100644 --- a/lapack-netlib/SRC/cgelsd.f +++ b/lapack-netlib/SRC/cgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index b824374d3..1838629ae 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f index 1598f4de5..de33c9039 100644 --- a/lapack-netlib/SRC/chbevd.f +++ b/lapack-netlib/SRC/chbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbevd_2stage.f b/lapack-netlib/SRC/chbevd_2stage.f index 340c546e8..3c9c8ecc0 100644 --- a/lapack-netlib/SRC/chbevd_2stage.f +++ b/lapack-netlib/SRC/chbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f index c4ad20753..655006370 100644 --- a/lapack-netlib/SRC/chbgvd.f +++ b/lapack-netlib/SRC/chbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 2ddf74b98..dce0b2083 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cheevd_2stage.f b/lapack-netlib/SRC/cheevd_2stage.f index 830e13d30..a0e8843ae 100644 --- a/lapack-netlib/SRC/cheevd_2stage.f +++ b/lapack-netlib/SRC/cheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index c96f011af..4edc36f2a 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f index c44462394..06d01064d 100644 --- a/lapack-netlib/SRC/chpevd.f +++ b/lapack-netlib/SRC/chpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 5c9e417d3..c24ca1360 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/claed8.f b/lapack-netlib/SRC/claed8.f index c15a0365f..1600087ab 100644 --- a/lapack-netlib/SRC/claed8.f +++ b/lapack-netlib/SRC/claed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by SLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complexOTHERcomputational * * ===================================================================== - SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL CCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. * IF( K.LT.N ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL CLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/clals0.f b/lapack-netlib/SRC/clals0.f index e981fc36f..0b545d5d7 100644 --- a/lapack-netlib/SRC/clals0.f +++ b/lapack-netlib/SRC/clals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent optimizing +* compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/clalsd.f b/lapack-netlib/SRC/clalsd.f index a2da9a925..bdd6b31c5 100644 --- a/lapack-netlib/SRC/clalsd.f +++ b/lapack-netlib/SRC/clalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f index a57d9eaef..77a4ec3be 100644 --- a/lapack-netlib/SRC/cstedc.f +++ b/lapack-netlib/SRC/cstedc.f @@ -43,12 +43,6 @@ *> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dbdsdc.f b/lapack-netlib/SRC/dbdsdc.f index 99fe82296..4b6c3e694 100644 --- a/lapack-netlib/SRC/dbdsdc.f +++ b/lapack-netlib/SRC/dbdsdc.f @@ -45,13 +45,6 @@ *> respectively. DBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLASD3 for details. -*> *> The code currently calls DLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b3b3d8b2d..b1f45a2c6 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 738a122e8..87a4e702d 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f index 9b1f1e093..1a53650e8 100644 --- a/lapack-netlib/SRC/dlaed2.f +++ b/lapack-netlib/SRC/dlaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. *> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. * @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL DCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL DLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL DCOPY( N, DLAMDA, 1, D, 1 ) + CALL DCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/dlaed3.f b/lapack-netlib/SRC/dlaed3.f index c58944e60..f9982c89e 100644 --- a/lapack-netlib/SRC/dlaed3.f +++ b/lapack-netlib/SRC/dlaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. INTEGER CTOT( * ), INDX( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DGEMM, DLACPY, DLAED4, DLASET, XERBLA @@ -240,29 +232,9 @@ IF( K.EQ.0 ) $ RETURN * -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +265,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f index 3631fb456..5d1d9144d 100644 --- a/lapack-netlib/SRC/dlaed8.f +++ b/lapack-netlib/SRC/dlaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> DLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL DCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL DLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/dlaed9.f b/lapack-netlib/SRC/dlaed9.f index b88cdd907..0d209c2c2 100644 --- a/lapack-netlib/SRC/dlaed9.f +++ b/lapack-netlib/SRC/dlaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * DOUBLE PRECISION RHO * .. * .. Array Arguments .. -* DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (K) +*> DLAMBDA is DOUBLE PRECISION array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ DOUBLE PRECISION RHO * .. * .. Array Arguments .. - DOUBLE PRECISION D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. * @@ -174,8 +174,8 @@ DOUBLE PRECISION TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL DCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/dlals0.f b/lapack-netlib/SRC/dlals0.f index cfca22280..928405e22 100644 --- a/lapack-netlib/SRC/dlals0.f +++ b/lapack-netlib/SRC/dlals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/dlalsd.f b/lapack-netlib/SRC/dlalsd.f index d22c45dc6..706ac4c90 100644 --- a/lapack-netlib/SRC/dlalsd.f +++ b/lapack-netlib/SRC/dlalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dlas2.f b/lapack-netlib/SRC/dlas2.f index ea929e86f..ff75e3942 100644 --- a/lapack-netlib/SRC/dlas2.f +++ b/lapack-netlib/SRC/dlas2.f @@ -93,9 +93,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dlasd3.f b/lapack-netlib/SRC/dlasd3.f index df939efc5..44957377b 100644 --- a/lapack-netlib/SRC/dlasd3.f +++ b/lapack-netlib/SRC/dlasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to DLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> DLASD3 is called from DLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ DOUBLE PRECISION RHO, TEMP * .. * .. External Functions .. - DOUBLE PRECISION DLAMC3, DNRM2 - EXTERNAL DLAMC3, DNRM2 + DOUBLE PRECISION DNRM2 + EXTERNAL DNRM2 * .. * .. External Subroutines .. EXTERNAL DCOPY, DGEMM, DLACPY, DLASCL, DLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 20 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL DCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/dlasd8.f b/lapack-netlib/SRC/dlasd8.f index a769bdb22..73c3ef0b4 100644 --- a/lapack-netlib/SRC/dlasd8.f +++ b/lapack-netlib/SRC/dlasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is DOUBLE PRECISION array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine DLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( DLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/dlasv2.f b/lapack-netlib/SRC/dlasv2.f index 64a06dee1..cb2bf51c4 100644 --- a/lapack-netlib/SRC/dlasv2.f +++ b/lapack-netlib/SRC/dlasv2.f @@ -124,9 +124,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/dsbevd.f b/lapack-netlib/SRC/dsbevd.f index 3eb4ed8df..350c0a9f0 100644 --- a/lapack-netlib/SRC/dsbevd.f +++ b/lapack-netlib/SRC/dsbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbevd_2stage.f b/lapack-netlib/SRC/dsbevd_2stage.f index 45a64b478..82997c850 100644 --- a/lapack-netlib/SRC/dsbevd_2stage.f +++ b/lapack-netlib/SRC/dsbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsbgvd.f b/lapack-netlib/SRC/dsbgvd.f index 30b016611..0ab3177ac 100644 --- a/lapack-netlib/SRC/dsbgvd.f +++ b/lapack-netlib/SRC/dsbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dspevd.f b/lapack-netlib/SRC/dspevd.f index d9d6c8917..05aa91b03 100644 --- a/lapack-netlib/SRC/dspevd.f +++ b/lapack-netlib/SRC/dspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index ec3cdc1ac..24c2309c3 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dstedc.f b/lapack-netlib/SRC/dstedc.f index 2ed84afaa..6d533664b 100644 --- a/lapack-netlib/SRC/dstedc.f +++ b/lapack-netlib/SRC/dstedc.f @@ -42,12 +42,6 @@ *> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dstevd.f b/lapack-netlib/SRC/dstevd.f index 507f39b2b..54717df3d 100644 --- a/lapack-netlib/SRC/dstevd.f +++ b/lapack-netlib/SRC/dstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index eaaecd8d9..b27f4cdc7 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, DSYEVD needs N**2 more *> workspace than DSYEVX. *> \endverbatim diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f index 0eae8ad06..d5a68c35d 100644 --- a/lapack-netlib/SRC/dsyevd_2stage.f +++ b/lapack-netlib/SRC/dsyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index d6682d4e5..41a384c80 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sbdsdc.f b/lapack-netlib/SRC/sbdsdc.f index 18a404497..2a6cc9970 100644 --- a/lapack-netlib/SRC/sbdsdc.f +++ b/lapack-netlib/SRC/sbdsdc.f @@ -45,13 +45,6 @@ *> respectively. SBDSDC can be used to compute all singular values, *> and optionally, singular vectors or singular vectors in compact form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLASD3 for details. -*> *> The code currently calls SLASDQ if singular values only are desired. *> However, it can be slightly modified to compute singular values *> using the divide and conquer method. diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f index f5f17d34c..9fda7b593 100644 --- a/lapack-netlib/SRC/sgelsd.f +++ b/lapack-netlib/SRC/sgelsd.f @@ -59,12 +59,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index d3b5e3ba1..d271bb757 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -55,12 +55,6 @@ *> *> Note that the routine returns VT = V**T, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f index 16500e74c..cadf53555 100644 --- a/lapack-netlib/SRC/slaed2.f +++ b/lapack-netlib/SRC/slaed2.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, +* SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, * Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * .. Scalar Arguments .. @@ -28,7 +28,7 @@ * .. Array Arguments .. * INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), * $ INDXQ( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ W( * ), Z( * ) * .. * @@ -123,9 +123,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -148,7 +148,7 @@ *> \param[out] INDX *> \verbatim *> INDX is INTEGER array, dimension (N) -*> The permutation used to sort the contents of DLAMDA into +*> The permutation used to sort the contents of DLAMBDA into *> ascending order. *> \endverbatim *> @@ -207,7 +207,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W, + SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W, $ Q2, INDX, INDXC, INDXP, COLTYP, INFO ) * * -- LAPACK computational routine -- @@ -221,7 +221,7 @@ * .. Array Arguments .. INTEGER COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ), $ INDXQ( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ W( * ), Z( * ) * .. * @@ -300,9 +300,9 @@ * re-integrate the deflated parts from the last pass * DO 20 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) 20 CONTINUE - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDXC ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC ) DO 30 I = 1, N INDX( I ) = INDXQ( INDXC( I ) ) 30 CONTINUE @@ -324,11 +324,11 @@ DO 40 J = 1, N I = INDX( J ) CALL SCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 ) - DLAMDA( J ) = D( I ) + DLAMBDA( J ) = D( I ) IQ2 = IQ2 + N 40 CONTINUE CALL SLACPY( 'A', N, N, Q2, N, Q, LDQ ) - CALL SCOPY( N, DLAMDA, 1, D, 1 ) + CALL SCOPY( N, DLAMBDA, 1, D, 1 ) GO TO 190 END IF * @@ -421,7 +421,7 @@ PJ = NJ ELSE K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ PJ = NJ @@ -433,7 +433,7 @@ * Record the last eigenvalue. * K = K + 1 - DLAMDA( K ) = D( PJ ) + DLAMBDA( K ) = D( PJ ) W( K ) = Z( PJ ) INDXP( K ) = PJ * @@ -470,9 +470,9 @@ PSM( CT ) = PSM( CT ) + 1 130 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * I = 1 diff --git a/lapack-netlib/SRC/slaed3.f b/lapack-netlib/SRC/slaed3.f index e84f22be1..44c601f91 100644 --- a/lapack-netlib/SRC/slaed3.f +++ b/lapack-netlib/SRC/slaed3.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, +* SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, * CTOT, W, S, INFO ) * * .. Scalar Arguments .. @@ -27,7 +27,7 @@ * .. * .. Array Arguments .. * INTEGER CTOT( * ), INDX( * ) -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), * $ S( * ), W( * ) * .. * @@ -44,12 +44,6 @@ *> being combined by the matrix of eigenvectors of the K-by-K system *> which is solved here. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: @@ -104,14 +98,12 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in,out] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles -*> of the secular equation. May be changed on output by -*> having lowest order bit set to zero on Cray X-MP, Cray Y-MP, -*> Cray-2, or Cray C-90, as described above. +*> of the secular equation. *> \endverbatim *> *> \param[in] Q2 @@ -180,7 +172,7 @@ *> Modified by Francoise Tisseur, University of Tennessee *> * ===================================================================== - SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX, + SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX, $ CTOT, W, S, INFO ) * * -- LAPACK computational routine -- @@ -193,7 +185,7 @@ * .. * .. Array Arguments .. INTEGER CTOT( * ), INDX( * ) - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ), $ S( * ), W( * ) * .. * @@ -208,8 +200,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLAED4, SLASET, XERBLA @@ -239,30 +231,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = 1, K - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -293,10 +264,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 60 J = 1, K DO 40 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 40 CONTINUE DO 50 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE 60 CONTINUE DO 70 I = 1, K diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f index 9c8ba440c..9dd8a15f6 100644 --- a/lapack-netlib/SRC/slaed8.f +++ b/lapack-netlib/SRC/slaed8.f @@ -19,7 +19,7 @@ * =========== * * SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, -* CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, +* CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, * GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * .. Scalar Arguments .. @@ -30,7 +30,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), +* REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), * $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -141,9 +141,9 @@ *> process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (N) +*> DLAMBDA is REAL array, dimension (N) *> A copy of the first K eigenvalues which will be used by *> SLAED3 to form the secular equation. *> \endverbatim @@ -238,7 +238,7 @@ * * ===================================================================== SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO, - $ CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR, + $ CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR, $ GIVCOL, GIVNUM, INDXP, INDX, INFO ) * * -- LAPACK computational routine -- @@ -253,7 +253,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - REAL D( * ), DLAMDA( * ), GIVNUM( 2, * ), + REAL D( * ), DLAMBDA( * ), GIVNUM( 2, * ), $ Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * ) * .. * @@ -339,14 +339,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -464,7 +464,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -476,26 +476,26 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 110 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * IF( ICOMPQ.EQ.0 ) THEN DO 120 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) 120 CONTINUE ELSE DO 130 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL SCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 130 CONTINUE @@ -506,9 +506,9 @@ * IF( K.LT.N ) THEN IF( ICOMPQ.EQ.0 ) THEN - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) ELSE - CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL SLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, $ Q( 1, K+1 ), LDQ ) END IF diff --git a/lapack-netlib/SRC/slaed9.f b/lapack-netlib/SRC/slaed9.f index 4d07416e9..d1b7b29fd 100644 --- a/lapack-netlib/SRC/slaed9.f +++ b/lapack-netlib/SRC/slaed9.f @@ -18,15 +18,15 @@ * Definition: * =========== * -* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, -* S, LDS, INFO ) +* SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, +* W, S, LDS, INFO ) * * .. Scalar Arguments .. * INTEGER INFO, K, KSTART, KSTOP, LDQ, LDS, N * REAL RHO * .. * .. Array Arguments .. -* REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), +* REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), * $ W( * ) * .. * @@ -96,9 +96,9 @@ *> RHO >= 0 required. *> \endverbatim *> -*> \param[in] DLAMDA +*> \param[in] DLAMBDA *> \verbatim -*> DLAMDA is REAL array, dimension (K) +*> DLAMBDA is REAL array, dimension (K) *> The first K elements of this array contain the old roots *> of the deflated updating problem. These are the poles *> of the secular equation. @@ -151,8 +151,8 @@ *> at Berkeley, USA * * ===================================================================== - SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W, - $ S, LDS, INFO ) + SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA, + $ W, S, LDS, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -163,7 +163,7 @@ REAL RHO * .. * .. Array Arguments .. - REAL D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ), + REAL D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ), $ W( * ) * .. * @@ -174,8 +174,8 @@ REAL TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAED4, XERBLA @@ -212,30 +212,9 @@ * IF( K.EQ.0 ) $ RETURN -* -* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), -* which on any of these machines zeros out the bottommost -* bit of DLAMDA(I) if it is 1; this makes the subsequent -* subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DLAMDA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DLAMDA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, N - DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I ) - 10 CONTINUE * DO 20 J = KSTART, KSTOP - CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO ) + CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO ) * * If the zero finder fails, the computation is terminated. * @@ -261,10 +240,10 @@ CALL SCOPY( K, Q, LDQ+1, W, 1 ) DO 70 J = 1, K DO 50 I = 1, J - 1 - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 50 CONTINUE DO 60 I = J + 1, K - W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) ) + W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) ) 60 CONTINUE 70 CONTINUE DO 80 I = 1, K diff --git a/lapack-netlib/SRC/slals0.f b/lapack-netlib/SRC/slals0.f index 7d44e2864..f168dc653 100644 --- a/lapack-netlib/SRC/slals0.f +++ b/lapack-netlib/SRC/slals0.f @@ -389,6 +389,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = POLES( I, 2 )*Z( I ) / $ ( SLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -440,6 +445,11 @@ IF( Z( J ).EQ.ZERO ) THEN WORK( I ) = ZERO ELSE +* +* Use calls to the subroutine SLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* WORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/slalsd.f b/lapack-netlib/SRC/slalsd.f index 2197f728e..9943a52d9 100644 --- a/lapack-netlib/SRC/slalsd.f +++ b/lapack-netlib/SRC/slalsd.f @@ -47,12 +47,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/slas2.f b/lapack-netlib/SRC/slas2.f index 6ae86204c..02ecbf434 100644 --- a/lapack-netlib/SRC/slas2.f +++ b/lapack-netlib/SRC/slas2.f @@ -93,9 +93,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows, or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows, or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/slasd3.f b/lapack-netlib/SRC/slasd3.f index f9420f88a..8f74743c2 100644 --- a/lapack-netlib/SRC/slasd3.f +++ b/lapack-netlib/SRC/slasd3.f @@ -44,13 +44,6 @@ *> appropriate calls to SLASD4 and then updates the singular *> vectors by matrix multiplication. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> SLASD3 is called from SLASD1. *> \endverbatim * @@ -103,7 +96,7 @@ *> The leading dimension of the array Q. LDQ >= K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension(K) *> The first K elements of this array contain the old roots @@ -249,8 +242,8 @@ REAL RHO, TEMP * .. * .. External Functions .. - REAL SLAMC3, SNRM2 - EXTERNAL SLAMC3, SNRM2 + REAL SNRM2 + EXTERNAL SNRM2 * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, SLACPY, SLASCL, SLASD4, XERBLA @@ -310,27 +303,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DSIGMA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 20 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 20 CONTINUE -* * Keep a copy of Z. * CALL SCOPY( K, Z, 1, Q, 1 ) diff --git a/lapack-netlib/SRC/slasd8.f b/lapack-netlib/SRC/slasd8.f index 43b171e5f..df5002367 100644 --- a/lapack-netlib/SRC/slasd8.f +++ b/lapack-netlib/SRC/slasd8.f @@ -121,14 +121,12 @@ *> The leading dimension of DIFR, must be at least K. *> \endverbatim *> -*> \param[in,out] DSIGMA +*> \param[in] DSIGMA *> \verbatim *> DSIGMA is REAL array, dimension ( K ) *> On entry, the first K elements of this array contain the old *> roots of the deflated updating problem. These are the poles *> of the secular equation. -*> On exit, the elements of DSIGMA may be very slightly altered -*> in value. *> \endverbatim *> *> \param[out] WORK @@ -227,27 +225,6 @@ RETURN END IF * -* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can -* be computed with high relative accuracy (barring over/underflow). -* This is a problem on machines without a guard digit in -* add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). -* The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I), -* which on any of these machines zeros out the bottommost -* bit of DSIGMA(I) if it is 1; this makes the subsequent -* subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation -* occurs. On binary machines with a guard digit (almost all -* machines) it does not change DSIGMA(I) at all. On hexadecimal -* and decimal machines with a guard digit, it slightly -* changes the bottommost bits of DSIGMA(I). It does not account -* for hexadecimal or decimal machines without guard digits -* (we know of none). We use a subroutine call to compute -* 2*DLAMBDA(I) to prevent optimizing compilers from eliminating -* this code. -* - DO 10 I = 1, K - DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I ) - 10 CONTINUE -* * Book keeping. * IWK1 = 1 @@ -312,6 +289,11 @@ DSIGJP = -DSIGMA( J+1 ) END IF WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ ) +* +* Use calls to the subroutine SLAMC3 to enforce the parentheses +* (x+y)+z. The goal is to prevent optimizing compilers +* from doing x+(y+z). +* DO 60 I = 1, J - 1 WORK( I ) = Z( I ) / ( SLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ ) $ / ( DSIGMA( I )+DJ ) diff --git a/lapack-netlib/SRC/slasv2.f b/lapack-netlib/SRC/slasv2.f index 6b98e9a0c..bf5d3ea0e 100644 --- a/lapack-netlib/SRC/slasv2.f +++ b/lapack-netlib/SRC/slasv2.f @@ -124,9 +124,7 @@ *> infinite. *> *> Overflow will not occur unless the largest singular value itself -*> overflows or is within a few ulps of overflow. (On machines with -*> partial overflow, like the Cray, overflow may occur if the largest -*> singular value is within a factor of 2 of overflow.) +*> overflows or is within a few ulps of overflow. *> *> Underflow is harmless if underflow is gradual. Otherwise, results *> may correspond to a matrix modified by perturbations of size near diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f index bcf14ce85..e87f9a030 100644 --- a/lapack-netlib/SRC/ssbevd.f +++ b/lapack-netlib/SRC/ssbevd.f @@ -40,12 +40,6 @@ *> a real symmetric band matrix A. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f index 9687ee024..014bade48 100644 --- a/lapack-netlib/SRC/ssbevd_2stage.f +++ b/lapack-netlib/SRC/ssbevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses *> a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f index 6dd1fe952..7c21ee455 100644 --- a/lapack-netlib/SRC/ssbgvd.f +++ b/lapack-netlib/SRC/ssbgvd.f @@ -43,12 +43,6 @@ *> banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f index 56329da34..0872e95ac 100644 --- a/lapack-netlib/SRC/sspevd.f +++ b/lapack-netlib/SRC/sspevd.f @@ -40,12 +40,6 @@ *> of a real symmetric matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 8ce2311fa..1a88365f2 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f index 925b03422..61e3c2fda 100644 --- a/lapack-netlib/SRC/sstedc.f +++ b/lapack-netlib/SRC/sstedc.f @@ -42,12 +42,6 @@ *> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See SLAED3 for details. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f index bc5b5aaab..218af8c76 100644 --- a/lapack-netlib/SRC/sstevd.f +++ b/lapack-netlib/SRC/sstevd.f @@ -40,12 +40,6 @@ *> real symmetric tridiagonal matrix. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index ac0d0284d..ee0e33384 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -40,13 +40,6 @@ *> real symmetric matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. -*> *> Because of large use of BLAS of level 3, SSYEVD needs N**2 more *> workspace than SSYEVX. *> \endverbatim diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f index f3fde6b4a..e63e280a7 100644 --- a/lapack-netlib/SRC/ssyevd_2stage.f +++ b/lapack-netlib/SRC/ssyevd_2stage.f @@ -45,12 +45,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 79f12a6f9..3c8bd2a0e 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -42,12 +42,6 @@ *> B are assumed to be symmetric and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgelsd.f b/lapack-netlib/SRC/zgelsd.f index 01793e16c..b5bc768e8 100644 --- a/lapack-netlib/SRC/zgelsd.f +++ b/lapack-netlib/SRC/zgelsd.f @@ -60,12 +60,6 @@ *> singular values which are less than RCOND times the largest singular *> value. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index 30d18a3a0..7f203afa5 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -53,12 +53,6 @@ *> *> Note that the routine returns VT = V**H, not V. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbevd.f b/lapack-netlib/SRC/zhbevd.f index 0db551540..be9f01556 100644 --- a/lapack-netlib/SRC/zhbevd.f +++ b/lapack-netlib/SRC/zhbevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian band matrix A. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbevd_2stage.f b/lapack-netlib/SRC/zhbevd_2stage.f index 4522d5e79..e32c7125c 100644 --- a/lapack-netlib/SRC/zhbevd_2stage.f +++ b/lapack-netlib/SRC/zhbevd_2stage.f @@ -47,12 +47,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it *> uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhbgvd.f b/lapack-netlib/SRC/zhbgvd.f index b0664750e..4bd02168d 100644 --- a/lapack-netlib/SRC/zhbgvd.f +++ b/lapack-netlib/SRC/zhbgvd.f @@ -46,12 +46,6 @@ *> and banded, and B is also positive definite. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index 7f58c7f72..ba52f9e72 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -41,12 +41,6 @@ *> complex Hermitian matrix A. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zheevd_2stage.f b/lapack-netlib/SRC/zheevd_2stage.f index 9859b0d67..e697a9823 100644 --- a/lapack-netlib/SRC/zheevd_2stage.f +++ b/lapack-netlib/SRC/zheevd_2stage.f @@ -46,12 +46,6 @@ *> the reduction to tridiagonal. If eigenvectors are desired, it uses a *> divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index 2c3586517..c9ff55e3d 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -43,12 +43,6 @@ *> B are assumed to be Hermitian and B is also positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhpevd.f b/lapack-netlib/SRC/zhpevd.f index 7625c8fe8..5260aaf14 100644 --- a/lapack-netlib/SRC/zhpevd.f +++ b/lapack-netlib/SRC/zhpevd.f @@ -41,12 +41,6 @@ *> a complex Hermitian matrix A in packed storage. If eigenvectors are *> desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index e9688f0c7..dfe92067c 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -44,12 +44,6 @@ *> positive definite. *> If eigenvectors are desired, it uses a divide and conquer algorithm. *> -*> The divide and conquer algorithm makes very mild assumptions about -*> floating point arithmetic. It will work on machines with a guard -*> digit in add/subtract, or on those binary machines without guard -*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or -*> Cray-2. It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zlaed8.f b/lapack-netlib/SRC/zlaed8.f index 995a673de..003725820 100644 --- a/lapack-netlib/SRC/zlaed8.f +++ b/lapack-netlib/SRC/zlaed8.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, +* SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, * Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, * GIVCOL, GIVNUM, INFO ) * @@ -29,7 +29,7 @@ * .. Array Arguments .. * INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), * $ INDXQ( * ), PERM( * ) -* DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), +* DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), * $ Z( * ) * COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -122,9 +122,9 @@ *> destroyed during the updating process. *> \endverbatim *> -*> \param[out] DLAMDA +*> \param[out] DLAMBDA *> \verbatim -*> DLAMDA is DOUBLE PRECISION array, dimension (N) +*> DLAMBDA is DOUBLE PRECISION array, dimension (N) *> Contains a copy of the first K eigenvalues which will be used *> by DLAED3 to form the secular equation. *> \endverbatim @@ -222,7 +222,7 @@ *> \ingroup complex16OTHERcomputational * * ===================================================================== - SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA, + SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA, $ Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR, $ GIVCOL, GIVNUM, INFO ) * @@ -237,7 +237,7 @@ * .. Array Arguments .. INTEGER GIVCOL( 2, * ), INDX( * ), INDXP( * ), $ INDXQ( * ), PERM( * ) - DOUBLE PRECISION D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ), + DOUBLE PRECISION D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ), $ Z( * ) COMPLEX*16 Q( LDQ, * ), Q2( LDQ2, * ) * .. @@ -322,14 +322,14 @@ INDXQ( I ) = INDXQ( I ) + CUTPNT 20 CONTINUE DO 30 I = 1, N - DLAMDA( I ) = D( INDXQ( I ) ) + DLAMBDA( I ) = D( INDXQ( I ) ) W( I ) = Z( INDXQ( I ) ) 30 CONTINUE I = 1 J = CUTPNT + 1 - CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX ) + CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX ) DO 40 I = 1, N - D( I ) = DLAMDA( INDX( I ) ) + D( I ) = DLAMBDA( INDX( I ) ) Z( I ) = W( INDX( I ) ) 40 CONTINUE * @@ -438,7 +438,7 @@ ELSE K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM JLAM = J END IF @@ -450,19 +450,19 @@ * K = K + 1 W( K ) = Z( JLAM ) - DLAMDA( K ) = D( JLAM ) + DLAMBDA( K ) = D( JLAM ) INDXP( K ) = JLAM * 100 CONTINUE * -* Sort the eigenvalues and corresponding eigenvectors into DLAMDA +* Sort the eigenvalues and corresponding eigenvectors into DLAMBDA * and Q2 respectively. The eigenvalues/vectors which were not -* deflated go into the first K slots of DLAMDA and Q2 respectively, +* deflated go into the first K slots of DLAMBDA and Q2 respectively, * while those which were deflated go into the last N - K slots. * DO 110 J = 1, N JP = INDXP( J ) - DLAMDA( J ) = D( JP ) + DLAMBDA( J ) = D( JP ) PERM( J ) = INDXQ( INDX( JP ) ) CALL ZCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 ) 110 CONTINUE @@ -471,7 +471,7 @@ * into the last N - K slots of D and Q respectively. * IF( K.LT.N ) THEN - CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 ) + CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 ) CALL ZLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ), $ LDQ ) END IF diff --git a/lapack-netlib/SRC/zlals0.f b/lapack-netlib/SRC/zlals0.f index 7a7310042..79c0cf5e4 100644 --- a/lapack-netlib/SRC/zlals0.f +++ b/lapack-netlib/SRC/zlals0.f @@ -392,6 +392,11 @@ $ ( POLES( I, 2 ).EQ.ZERO ) ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = POLES( I, 2 )*Z( I ) / $ ( DLAMC3( POLES( I, 2 ), DSIGJ )- $ DIFLJ ) / ( POLES( I, 2 )+DJ ) @@ -470,6 +475,11 @@ IF( Z( J ).EQ.ZERO ) THEN RWORK( I ) = ZERO ELSE +* +* Use calls to the subroutine DLAMC3 to enforce the +* parentheses (x+y)+z. The goal is to prevent +* optimizing compilers from doing x+(y+z). +* RWORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1, $ 2 ) )-DIFR( I, 1 ) ) / $ ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 ) diff --git a/lapack-netlib/SRC/zlalsd.f b/lapack-netlib/SRC/zlalsd.f index dca308e56..1d7358aa9 100644 --- a/lapack-netlib/SRC/zlalsd.f +++ b/lapack-netlib/SRC/zlalsd.f @@ -48,12 +48,6 @@ *> problem; in this case a minimum norm solution is returned. *> The actual singular values are returned in D in ascending order. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/SRC/zstedc.f b/lapack-netlib/SRC/zstedc.f index 74d390af7..e62063a19 100644 --- a/lapack-netlib/SRC/zstedc.f +++ b/lapack-netlib/SRC/zstedc.f @@ -43,12 +43,6 @@ *> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this *> matrix to tridiagonal form. *> -*> This code makes very mild assumptions about floating point -*> arithmetic. It will work on machines with a guard digit in -*> add/subtract, or on those binary machines without guard digits -*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. -*> It could conceivably fail on hexadecimal or decimal machines -*> without guard digits, but we know of none. See DLAED3 for details. *> \endverbatim * * Arguments: From b9fb63c05e06684653b5e44e8a6a3d5037bbbd89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 16:25:08 +0200 Subject: [PATCH 165/718] Remove OMP-private variable outside OMP context (LAPACK PR860) --- lapack-netlib/SRC/chetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index e047ab720..30b01ed83 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -537,7 +537,7 @@ C END IF $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 From 5b6921780868806c14a9a8123c649f31216a9a9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 16:26:21 +0200 Subject: [PATCH 166/718] Update zhetrd_hb2st.FRemove OMP-private variable outside OMP context (LAPACK PR860) --- lapack-netlib/SRC/zhetrd_hb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index e839271a4..1d39ac942 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -538,7 +538,7 @@ C END IF $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 From a0e413ef054993a79cc7909cbc147e22ec2d60c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 16:28:15 +0200 Subject: [PATCH 167/718] Update ssytrd_sb2st.FRemove OMP-private variable outside OMP context (LAPACK PR860) --- lapack-netlib/SRC/ssytrd_sb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index faeeff5f4..b8386670a 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -506,7 +506,7 @@ $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 From 49202228d75b806c6e1ac7fe63e61b3b6493b856 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 16:29:04 +0200 Subject: [PATCH 168/718] Remove OMP-private variable outside OMP context (LAPACK PR860) --- lapack-netlib/SRC/dsytrd_sb2st.F | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index fabc10756..bb74dd491 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -506,7 +506,7 @@ $ STIND, EDIND, SWEEPID, N, KD, IB, $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, - $ WORK( INDW + TID*KD ) ) + $ WORK( INDW ) ) #endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 From d6be5036d7495d3471d4e7c0cf8bb791b029e811 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Jun 2023 21:19:33 +0200 Subject: [PATCH 169/718] Fix IDAMAX --- kernel/riscv64/iamax_vector.c | 40 +++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 9fea522f7..4242af6ea 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -29,8 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(DOUBLE) - -#define ABS fabs +#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 #define VSETVL(n) vsetvl_e64m8(n) #define VSETVL_MAX vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m8_t @@ -54,8 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VADDVX_UINT vadd_vx_u64m8 #define VMVVX_UINT vmv_v_x_u64m8 #else - -#define ABS fabsf +#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 #define VSETVL(n) vsetvl_e32m8(n) #define VSETVL_MAX vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m8_t @@ -85,7 +83,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; FLOAT maxf=0.0; +#ifdef DOUBLE + BLASLONG max_index = 0; +#else unsigned int max_index = 0; +#endif if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx, v_max; @@ -117,11 +119,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; } v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + maxf = VFMVFS_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); - +#ifdef DOUBLE + max_index = *((BLASLONG *)&v_max_index+max_index); +#else + max_index = *((unsigned int *)&v_max_index+max_index); +#endif if(j < n){ gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); @@ -130,7 +135,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -138,7 +143,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned int*)&v_max_index+max_index); +#endif } } }else{ @@ -165,11 +174,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) idx += inc_v; } v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = *((FLOAT*)&v_res); + maxf = VFMVFS_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned int*)&v_max_index+max_index); - +#endif if(j < n){ gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); @@ -178,7 +190,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -186,11 +198,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); max_index = VMFIRSTM(mask,gvl); +#ifdef DOUBLE + max_index = *((BLASLONG*)&v_max_index+max_index); +#else max_index = *((unsigned int*)&v_max_index+max_index); +#endif } } } return(max_index+1); } - - From 772b0cc71514409511eb4efd4b5770ae40b2f4e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Jun 2023 16:12:27 +0200 Subject: [PATCH 170/718] Fix early bailout --- kernel/riscv64/dot_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index f47e0c0b5..cc27d68ed 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -63,7 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0, j=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); FLOAT_V_T vr, vx, vy; unsigned int gvl = 0; From ceaee7dc645da97e5e99ee7c0b2a9be8709b32c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Jun 2023 16:13:23 +0200 Subject: [PATCH 171/718] remove the limitation to -O1 again --- Makefile.riscv64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 0246c0f7d..ce91e03ec 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -1,4 +1,4 @@ ifeq ($(CORE), C910V) -CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -O1 +CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif From 616fdea82ad91ebb481d7da469d0b238473bc9b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2023 09:45:17 +0200 Subject: [PATCH 172/718] Revert "Improve Windows threading performance scaling" --- CONTRIBUTORS.md | 5 -- driver/others/blas_server_win32.c | 110 +++++++++++++++++------------- 2 files changed, 61 insertions(+), 54 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7efc04092..71df13634 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -216,8 +216,3 @@ In chronological order: * Pablo Romero * [2022-08] Fix building from sources for QNX - -* Mark Seminatore - * [2023-06-23] Fix bounds issue in goto_set_num_threads - * [2023-06-23] Improve Windows threading performance scaling - \ No newline at end of file diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b213bf2c..afa33cccc 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,19 +50,11 @@ /* This is a thread implementation for Win32 lazy implementation */ -#if defined (__GNUC__) && (__GNUC__ < 6) - #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) -#else - #if defined(_WIN64) - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) - #else - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) - #endif -#endif - /* Thread server common information */ typedef struct{ - HANDLE taskSemaphore; + CRITICAL_SECTION lock; + HANDLE filled; + HANDLE killed; blas_queue_t *queue; /* Parameter Pointer */ int shutdown; /* server shutdown flag */ @@ -79,6 +71,8 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; + + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ @@ -204,6 +198,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* This is a main routine of threads. Each thread waits until job is */ /* queued. */ + static DWORD WINAPI blas_thread_server(void *arg){ /* Thread identifier */ @@ -212,7 +207,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ #endif void *buffer, *sa, *sb; - volatile blas_queue_t *queue; + blas_queue_t *queue; + DWORD action; + HANDLE handles[] = {pool.filled, pool.killed}; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); @@ -229,32 +226,28 @@ static DWORD WINAPI blas_thread_server(void *arg){ fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); #endif - // all worker threads wait on the semaphore - WaitForSingleObject(pool.taskSemaphore, INFINITE); + do { + action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); + } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); + + if (action == WAIT_OBJECT_0 + 1) break; - // kill the thread if we are shutting down the server - if (pool.shutdown) - break; - #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif - // grab a queued task and update the list - volatile blas_queue_t* queue_next; - INT_PTR prev_value; - do { - queue = (volatile blas_queue_t*)pool.queue; - if (!queue) - break; + EnterCriticalSection(&pool.lock); + + queue = pool.queue; + if (queue) pool.queue = queue->next; - queue_next = (volatile blas_queue_t*)queue->next; - prev_value = WIN_CAS((INT_PTR*)&pool.queue, (INT_PTR)queue_next, (INT_PTR)queue); - } while (prev_value != queue); + LeaveCriticalSection(&pool.lock); if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + if (pool.queue) SetEvent(pool.filled); + sa = queue -> sa; sb = queue -> sb; @@ -339,8 +332,13 @@ static DWORD WINAPI blas_thread_server(void *arg){ fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif - // mark our sub-task as complete - InterlockedDecrement(&queue->status); + EnterCriticalSection(&queue->lock); + + queue -> status = BLAS_STATUS_FINISHED; + + LeaveCriticalSection(&queue->lock); + + SetEvent(queue->finish); } /* Shutdown procedure */ @@ -355,7 +353,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ } /* Initializing routine */ - int blas_thread_init(void){ +int blas_thread_init(void){ BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; @@ -369,7 +367,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ if (!blas_server_avail){ - pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL); + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); pool.shutdown = 0; pool.queue = NULL; @@ -391,10 +391,11 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. + exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. + exec_blas ... returns after jobs are finished. */ + int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #if defined(SMP_SERVER) @@ -408,7 +409,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ current = queue; while (current) { - current->status = 1; + InitializeCriticalSection(¤t -> lock); + current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); current -> position = pos; #ifdef CONSISTENT_FPCSR @@ -420,10 +422,19 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pos ++; } - pool.queue = queue; + EnterCriticalSection(&pool.lock); + + if (pool.queue) { + current = pool.queue; + while (current -> next) current = current -> next; + current -> next = queue; + } else { + pool.queue = queue; + } + + LeaveCriticalSection(&pool.lock); - // start up worker threads - ReleaseSemaphore(pool.taskSemaphore, pos - 1, NULL); + SetEvent(pool.filled); return 0; } @@ -439,9 +450,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "Waiting Queue ..\n"); #endif - // spin-wait on each sub-task to finish - while (*((volatile int*)&queue->status)) - YIELDING; + WaitForSingleObject(queue->finish, INFINITE); + + CloseHandle(queue->finish); + DeleteCriticalSection(&queue -> lock); queue = queue -> next; num --; @@ -489,21 +501,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. */ + int BLASFUNC(blas_thread_shutdown)(void){ int i; -#ifdef SMP_DEBUG - fprintf(STDERR, "blas_thread_shutdown..\n"); -#endif - if (!blas_server_avail) return 0; LOCK_COMMAND(&server_lock); if (blas_server_avail){ - pool.shutdown = 1; + SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects @@ -519,7 +528,8 @@ int BLASFUNC(blas_thread_shutdown)(void){ CloseHandle(blas_threads[i]); } - CloseHandle(pool.taskSemaphore); + CloseHandle(pool.filled); + CloseHandle(pool.killed); blas_server_avail = 0; } @@ -549,14 +559,16 @@ void goto_set_num_threads(int num_threads) //increased_threads = 1; if (!blas_server_avail){ - pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL); + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); pool.shutdown = 0; pool.queue = NULL; blas_server_avail = 1; } - for(i = blas_num_threads; i < num_threads - 1; i++){ + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From e14a025bb1f5abbd7a71c1524882526d7bc5e28f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 28 Jun 2023 11:17:38 +0000 Subject: [PATCH 173/718] Temporily walk around zaxpy vector kernel bug. --- kernel/riscv64/KERNEL.C910V | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa35..18cb3bafd 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy_vector.c -ZAXPYKERNEL = zaxpy_vector.c +CAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c From f7b916076b1f32ccedbbd44c84ab9784d6fbfda2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Jun 2023 21:30:59 +0200 Subject: [PATCH 174/718] Add Jenkins configuration files for OSUOSL powerci and ibmz-ci --- Jenkinsfile | 15 ++++++++++----- Jenkinsfile.pwr | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 Jenkinsfile.pwr diff --git a/Jenkinsfile b/Jenkinsfile index 2b61bed9f..5fad6a95b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,9 +1,14 @@ -node { - stage('Checkout') { - checkout +pipeline { + agent { + docker { + image 'osuosl/ubuntu-s390x' } - + } + stages { stage('Build') { - sh("make") + steps { + sh 'make' + } } + } } diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr new file mode 100644 index 000000000..d141ed8a5 --- /dev/null +++ b/Jenkinsfile.pwr @@ -0,0 +1,16 @@ +pipeline { + agent { + docker { + image 'osuosl/ubuntu-ppc64le' + } + } + stages { + stage('Build') { + steps { + sh 'sudo apt update' + sh 'sudo apt install gfortran -y' + sh 'make' + } + } + } +} From bb862b82d54bc3c29e905179035e8839742f1ea9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Jun 2023 23:59:25 +0200 Subject: [PATCH 175/718] Fix integer overflow in multithreading threshold calculation for SYMM/SYRK (#4116) * Fix potential integer overflow --- interface/symm.c | 4 ++-- interface/syrk.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/symm.c b/interface/symm.c index 3e65e69b1..3e6e0fd48 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO, int nodes; #endif # if defined(SMP) - int MN; + double MN; #endif blasint info; int side; @@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, int nodes; #endif #if defined(SMP) - int MN; + double MN; #endif PRINT_DEBUG_CNAME; diff --git a/interface/syrk.c b/interface/syrk.c index 3b056aec8..69f2328a4 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, FLOAT *sa, *sb; #ifdef SMP - int NNK; + double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX #ifdef XDOUBLE @@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr FLOAT *sa, *sb; #ifdef SMP -int NNK; +double NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX From 6deb52812d61dffcc8e60dd355ca2d763ddbe55e Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jun 2023 14:11:01 +0800 Subject: [PATCH 176/718] LoongArch64: Add WhereAmI() --- common_loongarch64.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/common_loongarch64.h b/common_loongarch64.h index e15539b5f..ce1fcf091 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } +#ifndef NO_AFFINITY +static inline int WhereAmI(void){ + int ret = 0, counter = 0; + __asm__ volatile ( + "rdtimel.w %[counter], %[id]" + : [id]"=r"(ret), [counter]"=r"(counter) + : + : "memory" + ); + return ret; +} +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") #else From 4d0b7fbec04c95c90291938a5974f00673e10e68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Jul 2023 12:38:57 +0200 Subject: [PATCH 177/718] Adjust M1 crossbuilds after image autoupgrade and add an M1-to-Android crossbuild on Cirrus CI (#4117) * Fix Apple crossbuilds after Cirrus updated the image; add a crossbuild to Android/ARMV7 --- .cirrus.yml | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bef2b1e86..8a1c4a0a8 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -32,7 +32,7 @@ task: - make macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4 + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest task: name: AppleM1/LLVM x86_64 xbuild compile_script: @@ -47,10 +47,11 @@ task: - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64" - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64" - export VALID_ARCHS="i386 x86_64" - - #find /Applications/Xcode-13.4.1.app -name libunwind.dylib - - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator15.5.sdk -arch x86_64 -miphoneos-version-min=10.0" - - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - xcrun --sdk macosx --show-sdk-path + - xcodebuild -version + - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64" + - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: path: "*conf*" @@ -60,7 +61,7 @@ task: # type: application/octet-streamm macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4 + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest task: name: AppleM1/LLVM armv8-ios xbuild compile_script: @@ -68,15 +69,33 @@ task: - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - find /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms -name "IPhoneOS*sdk" - - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS15.5.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 always: config_artifacts: path: "*conf*" type: text/plain +macos_instance: + image: ghcr.io/cirruslabs/macos-monterey-xcode:latest +task: + name: AppleM1/LLVM armv7-androidndk xbuild + compile_script: + - #brew install android-ndk + - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH + - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib" + - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" + always: + config_artifacts: + path: "*conf*" + type: text/plain + task: name: NeoverseN1 arm_container: From a32af56761f2df742baf976b07b9b5ff72df5342 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Jul 2023 21:50:15 +0200 Subject: [PATCH 178/718] Add status badges for OSUOSL's POWERCI and IBMZ-CI services --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a2eac07be..081d45870 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,12 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS) + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) +OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/) +OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/) ## Introduction OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. From ac698cedad4bd0840e165a22c0aafdc35a281ae3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 09:47:49 +0200 Subject: [PATCH 179/718] Add compiler options for ARM64 SVE targets in DYNAMIC_ARCH builds --- cmake/system.cmake | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 3dc6c863e..414193ec8 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -280,7 +280,29 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL POWER8) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") endif() + +if (${TARGET} STREQUAL NEOVERSEV1) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") + endif() + endif() + if (${TARGET} STREQUAL NEOVERSEN2) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") + endif() + endif() + if (${TARGET} STREQUAL ARMV8SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") + endif() + endif() + if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () From 2f9f70584e4d1bb34dee6d273ffa38e9b4474710 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 09:48:54 +0200 Subject: [PATCH 180/718] Fix target list syntax for ARM64 DYNAMIC_ARCH --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index f70019800..e6e434a0a 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2") + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) From 1363a7c4f12f4188869feea554e66585fc52db3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 10:10:13 +0200 Subject: [PATCH 181/718] Correct the order of eigenvalues/vector for 2x2 matrices (Reference-LAPACK PR 867) --- lapack-netlib/SRC/cstemr.f | 36 +++++++++++++++++++++++++++++------- lapack-netlib/SRC/dstemr.f | 36 +++++++++++++++++++++++++++++------- lapack-netlib/SRC/sstemr.f | 36 +++++++++++++++++++++++++++++------- lapack-netlib/SRC/zstemr.f | 34 ++++++++++++++++++++++++++++------ 4 files changed, 115 insertions(+), 27 deletions(-) diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f index d49684db3..9d47450e3 100644 --- a/lapack-netlib/SRC/cstemr.f +++ b/lapack-netlib/SRC/cstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -329,7 +329,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f index d0c71ddd9..44a33423e 100644 --- a/lapack-netlib/SRC/dstemr.f +++ b/lapack-netlib/SRC/dstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -380,6 +382,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -502,6 +505,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -509,8 +521,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -533,8 +550,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f index 3a9bbe784..2ed697b69 100644 --- a/lapack-netlib/SRC/sstemr.f +++ b/lapack-netlib/SRC/sstemr.f @@ -303,7 +303,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -312,7 +312,8 @@ *> Jim Demmel, University of California, Berkeley, USA \n *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n -*> Christof Voemel, University of California, Berkeley, USA +*> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -344,7 +345,8 @@ $ MINRGP = 3.0E-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -378,6 +380,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * SSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -500,6 +503,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -507,8 +519,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -531,8 +548,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f index b034198de..4eaf5ef97 100644 --- a/lapack-netlib/SRC/zstemr.f +++ b/lapack-netlib/SRC/zstemr.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup stemr * *> \par Contributors: * ================== @@ -330,6 +330,7 @@ *> Inderjit Dhillon, University of Texas, Austin, USA \n *> Osni Marques, LBNL/NERSC, USA \n *> Christof Voemel, University of California, Berkeley, USA \n +*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n * * ===================================================================== SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, @@ -361,7 +362,8 @@ $ MINRGP = 1.0D-3 ) * .. * .. Local Scalars .. - LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY + LOGICAL ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY, + $ LAESWAP INTEGER I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW, $ IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD, $ INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP, @@ -397,6 +399,7 @@ * LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) ) ZQUERY = ( NZC.EQ.-1 ) + LAESWAP = .FALSE. * DSTEMR needs WORK of size 6*N, IWORK of size 3*N. * In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N. @@ -519,6 +522,15 @@ ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN ) END IF +* D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However, +* the following code requires R1 >= R2. Hence, we correct +* the order of R1, R2, CS, SN if R1 < R2 before further processing. + IF( R1.LT.R2 ) THEN + E(2) = R1 + R1 = R2 + R2 = E(2) + LAESWAP = .TRUE. + ENDIF IF( ALLEIG.OR. $ (VALEIG.AND.(R2.GT.WL).AND. $ (R2.LE.WU)).OR. @@ -526,8 +538,13 @@ M = M+1 W( M ) = R2 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = -SN - Z( 2, M ) = CS + IF( LAESWAP ) THEN + Z( 1, M ) = CS + Z( 2, M ) = SN + ELSE + Z( 1, M ) = -SN + Z( 2, M ) = CS + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN @@ -550,8 +567,13 @@ M = M+1 W( M ) = R1 IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN - Z( 1, M ) = CS - Z( 2, M ) = SN + IF( LAESWAP ) THEN + Z( 1, M ) = -SN + Z( 2, M ) = CS + ELSE + Z( 1, M ) = CS + Z( 2, M ) = SN + ENDIF * Note: At most one of SN and CS can be zero. IF (SN.NE.ZERO) THEN IF (CS.NE.ZERO) THEN From e1958eb70529c36d7dc4f3baf9e7bf37524053ab Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 5 Jul 2023 11:34:00 +0100 Subject: [PATCH 182/718] Fixes RVV masked intrinsics for iamax/iamin/imax/imin kernels Changes masked intrinsics from _m to _mu and reintroduces maskedoff argument. --- kernel/riscv64/iamax_rvv.c | 16 ++++++++-------- kernel/riscv64/iamin_rvv.c | 16 ++++++++-------- kernel/riscv64/imax_rvv.c | 16 ++++++++-------- kernel/riscv64/imin_rvv.c | 16 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index ef7850a55..d3508a91d 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index 56a086fed..ae1d4f726 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -46,9 +46,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -72,9 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 5b60a56f7..33250568d 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, vl); diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index b49544a1b..4ce49c3af 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); @@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, vl); From 069353bd445235deca47b4c902346432b24450d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 14:13:58 +0200 Subject: [PATCH 183/718] Add reciprocal scaling of a complex vector and use it in C/ZGETF2 (Reference-LAPACK PR839) --- lapack-netlib/SRC/cgetf2.f | 25 ++--- lapack-netlib/SRC/crscl.f | 202 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgetf2.f | 17 +--- lapack-netlib/SRC/zrscl.f | 203 +++++++++++++++++++++++++++++++++++++ 4 files changed, 416 insertions(+), 31 deletions(-) create mode 100644 lapack-netlib/SRC/crscl.f create mode 100644 lapack-netlib/SRC/zrscl.f diff --git a/lapack-netlib/SRC/cgetf2.f b/lapack-netlib/SRC/cgetf2.f index aac989970..995ee40ec 100644 --- a/lapack-netlib/SRC/cgetf2.f +++ b/lapack-netlib/SRC/cgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO ) @@ -126,16 +126,14 @@ $ ZERO = ( 0.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. - REAL SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. - REAL SLAMCH INTEGER ICAMAX - EXTERNAL SLAMCH, ICAMAX + EXTERNAL ICAMAX * .. * .. External Subroutines .. - EXTERNAL CGERU, CSCAL, CSWAP, XERBLA + EXTERNAL CGERU, CRSCL, CSWAP, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -161,10 +159,6 @@ * IF( M.EQ.0 .OR. N.EQ.0 ) $ RETURN -* -* Compute machine safe minimum -* - SFMIN = SLAMCH('S') * DO 10 J = 1, MIN( M, N ) * @@ -181,15 +175,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/crscl.f b/lapack-netlib/SRC/crscl.f new file mode 100644 index 000000000..22919cd62 --- /dev/null +++ b/lapack-netlib/SRC/crscl.f @@ -0,0 +1,202 @@ +*> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX A +* .. +* .. Array Arguments .. +* COMPLEX X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector X. +*> > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +* ===================================================================== + SUBROUTINE CRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX A +* .. +* .. Array Arguments .. + COMPLEX X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + REAL SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR + % , UI +* .. +* .. External Functions .. + REAL SLAMCH + COMPLEX CLADIV + EXTERNAL SLAMCH, CLADIV +* .. +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCAL, CSRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = SLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = SLAMCH( 'O' ) +* +* Initialize constants related to A. +* + AR = REAL( A ) + AI = AIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL CSRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL CSSCAL( N, SAFMIN, X, INCX ) + CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX ) + CALL CSSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of CRSCL +* + END diff --git a/lapack-netlib/SRC/zgetf2.f b/lapack-netlib/SRC/zgetf2.f index c247f8645..7c63dbbee 100644 --- a/lapack-netlib/SRC/zgetf2.f +++ b/lapack-netlib/SRC/zgetf2.f @@ -101,7 +101,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getf2 * * ===================================================================== SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO ) @@ -127,7 +127,7 @@ * .. * .. Local Scalars .. DOUBLE PRECISION SFMIN - INTEGER I, J, JP + INTEGER J, JP * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -135,7 +135,7 @@ EXTERNAL DLAMCH, IZAMAX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP + EXTERNAL XERBLA, ZGERU, ZRSCL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -181,15 +181,8 @@ * * Compute elements J+1:M of J-th column. * - IF( J.LT.M ) THEN - IF( ABS(A( J, J )) .GE. SFMIN ) THEN - CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) - ELSE - DO 20 I = 1, M-J - A( J+I, J ) = A( J+I, J ) / A( J, J ) - 20 CONTINUE - END IF - END IF + IF( J.LT.M ) + $ CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 ) * ELSE IF( INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zrscl.f b/lapack-netlib/SRC/zrscl.f new file mode 100644 index 000000000..970f6de75 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.f @@ -0,0 +1,203 @@ +*> \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZDRSCL + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* .. Scalar Arguments .. +* INTEGER INCX, N +* COMPLEX*16 A +* .. +* .. Array Arguments .. +* COMPLEX*16 X( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZRSCL multiplies an n-element complex vector x by the complex scalar +*> 1/a. This is done without overflow or underflow as long as +*> the final result x/a does not overflow or underflow. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of components of the vector x. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 +*> The scalar a which is used to divide each component of x. +*> A must not be 0, or the subroutine will divide by zero. +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension +*> (1+(N-1)*abs(INCX)) +*> The n-element vector x. +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> The increment between successive values of the vector SX. +*> > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +* ===================================================================== + SUBROUTINE ZRSCL( N, A, X, INCX ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INCX, N + COMPLEX*16 A +* .. +* .. Array Arguments .. + COMPLEX*16 X( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + COMPLEX*16 ZLADIV + EXTERNAL DLAMCH, ZLADIV +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, ZDSCAL, ZDRSCL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.LE.0 ) + $ RETURN +* +* Get machine parameters +* + SAFMIN = DLAMCH( 'S' ) + SAFMAX = ONE / SAFMIN + OV = DLAMCH( 'O' ) +* +* Initialize constants related to A. +* + AR = DBLE( A ) + AI = DIMAG( A ) + ABSR = ABS( AR ) + ABSI = ABS( AI ) +* + IF( AI.EQ.ZERO ) THEN +* If alpha is real, then we can use csrscl + CALL ZDRSCL( N, AR, X, INCX ) +* + ELSE IF( AR.EQ.ZERO ) THEN +* If alpha has a zero real part, then we follow the same rules as if +* alpha were real. + IF( ABSI.GT.SAFMAX ) THEN + CALL ZDSCAL( N, SAFMIN, X, INCX ) + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX ) + ELSE IF( ABSI.LT.SAFMIN ) THEN + CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX ) + END IF +* + ELSE +* The following numbers can be computed. +* They are the inverse of the real and imaginary parts of 1/alpha. +* Note that a and b are always different from zero. +* NaNs are only possible if either: +* 1. alphaR or alphaI is NaN. +* 2. alphaR and alphaI are both infinite, in which case it makes sense +* to propagate a NaN. + UR = AR + AI * ( AI / AR ) + UI = AI + AR * ( AR / AI ) +* + IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN +* This means that both alphaR and alphaI are very small. + CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X, + $ INCX ) + CALL ZDSCAL( N, SAFMAX, X, INCX ) + ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN + IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN +* This means that a and b are both Inf. No need for scaling. + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + ELSE + CALL ZDSCAL( N, SAFMIN, X, INCX ) + IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN +* Infs were generated. We do proper scaling to avoid them. + IF( ABSR.GE.ABSI ) THEN +* ABS( UR ) <= ABS( UI ) + UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR )) + UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI ) + ELSE +* ABS( UR ) > ABS( UI ) + UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR ) + UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI )) + END IF + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, + $ INCX ) + ELSE + CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ), + $ X, INCX ) + END IF + END IF + ELSE + CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX ) + END IF + END IF +* + RETURN +* +* End of ZRSCL +* + END From 1d399ad0d6c7b7d1cb99f7385ac9c30d82fbd41e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 14:17:26 +0200 Subject: [PATCH 184/718] Add CRSCL/ZRSCL (Reference-LAPACK PR839) --- lapack-netlib/SRC/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 74db14e46..c75fd5f49 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -280,7 +280,7 @@ CLASRC_O = \ cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ - crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ + crot.o crscl.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ cstegr.o cstein.o csteqr.o \ csycon.o csymv.o \ @@ -488,7 +488,7 @@ ZLASRC_O = \ zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ - zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ + zrot.o zrscl.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ zstegr.o zstein.o zsteqr.o \ zsycon.o zsymv.o \ From e3277e134cafff50bfd335da1d3a39a5112cf651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 Jul 2023 14:22:59 +0200 Subject: [PATCH 185/718] Add CRSCL/ZRSCL (Reference-LAPACK PR 839) --- cmake/lapack.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index d339f0ce9..ce5d0831f 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -187,7 +187,7 @@ set(CLASRC cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f + crot.f crscl.f cspcon.f csprfs.f cspsv.f cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f cstegr.f cstein.f csteqr.f csycon.f csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f @@ -381,7 +381,7 @@ set(ZLASRC zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f + zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f zstegr.f zstein.f zsteqr.f zsycon.f zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f From bed3a6a30404ae54770a8b059b32e5f6b0d1fb96 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Jul 2023 10:13:41 +0200 Subject: [PATCH 186/718] Fix segfault when NRHS is zero (Reference-LAPACK PR 876) --- lapack-netlib/SRC/cgelss.f | 12 +++++------- lapack-netlib/SRC/dgelss.f | 11 +++++------ lapack-netlib/SRC/sgelss.f | 11 +++++------ lapack-netlib/SRC/zgelss.f | 16 +++++++--------- 4 files changed, 22 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index da6b9092f..d1e38c504 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -214,8 +214,7 @@ * .. External Subroutines .. EXTERNAL CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV, $ CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR, - $ CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET, - $ XERBLA + $ CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA * .. * .. External Functions .. INTEGER ILAENV @@ -388,7 +387,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL CCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index c4190f2e0..38449be7f 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -203,7 +203,7 @@ * .. * .. External Subroutines .. EXTERNAL DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV, - $ DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR, + $ DGEQRF, DLACPY, DLASCL, DLASET, DORGBR, $ DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA * .. * .. External Functions .. @@ -385,7 +385,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -529,7 +528,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF @@ -626,7 +625,7 @@ CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -712,7 +711,7 @@ $ LDB, ZERO, WORK, N ) CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL DCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 9aed4329f..89d3a6e4f 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -202,7 +202,7 @@ * .. * .. External Subroutines .. EXTERNAL SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV, - $ SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR, + $ SGEQRF, SLACPY, SLASCL, SLASET, SORGBR, $ SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA * .. * .. External Functions .. @@ -381,7 +381,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -525,7 +524,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF @@ -622,7 +621,7 @@ CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, ZERO, WORK( IWORK ), 1 ) CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -708,7 +707,7 @@ $ LDB, ZERO, WORK, N ) CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 ) CALL SCOPY( N, WORK, 1, B, 1 ) END IF diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index be53ba95b..afdbaecf0 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -170,7 +170,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup gelss * * ===================================================================== SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, @@ -212,10 +212,9 @@ COMPLEX*16 DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, - $ ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, - $ ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ, - $ ZUNMQR + EXTERNAL DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL, + $ ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY, + $ ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ * .. * .. External Functions .. INTEGER ILAENV @@ -388,7 +387,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max element outside range [SMLNUM,BIGNUM] * @@ -540,7 +538,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB ) 20 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF @@ -645,7 +643,7 @@ CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ), $ LDB ) 40 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ), $ 1, CZERO, WORK( IWORK ), 1 ) CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 ) @@ -737,7 +735,7 @@ $ LDB, CZERO, WORK, N ) CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB ) 60 CONTINUE - ELSE + ELSE IF( NRHS.EQ.1 ) THEN CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 ) CALL ZCOPY( N, WORK, 1, B, 1 ) END IF From 63ec2ad3954347fcca04e2d4687a076205ba19f6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jul 2023 10:00:05 +0200 Subject: [PATCH 187/718] Fix computation of UPLO in LAPACKE_?larfb (Reference-LAPACK PR 878) --- lapack-netlib/LAPACKE/src/lapacke_clarfb.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ed12b476e..aac7b551d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 545769b83..67bbbd34f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index f4ddc62a5..aeebd8dec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index de444c146..de2f41e66 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index d36958f93..3d6c29f88 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 8b6127633..72a392a77 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 85355b202..c5edbbc0e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 72d85ec82..232c8ef58 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); - uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; ldc_t = MAX(1,m); ldt_t = MAX(1,k); From 7e93ab1b9e831c5bc229c3b4989f2d813472d928 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jul 2023 17:00:25 +0200 Subject: [PATCH 188/718] Fix info code returned for invalid ldb --- interface/imatcopy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index c4417e99c..4cf0966cc 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < *rows ) info = 8; + if ( trans == BlasTrans && *ldb < *cols ) info = 8; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < *cols ) info = 8; + if ( trans == BlasTrans && *ldb < *rows ) info = 8; } if ( order == BlasColMajor && *lda < *rows ) info = 7; From a721fccfdcbf54ab9d573e8e8259c6272636edc7 Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Tue, 11 Jul 2023 16:34:20 +0300 Subject: [PATCH 189/718] Fix riscv64 detection in system_check.cmake --- cmake/system_check.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c59e85d54..2b43a5a14 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") set(LOONGARCH64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") + set(RISCV64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -107,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) set(BINARY 64) else () set(BINARY 32) From 35dedb68ce9b6be845ac900fc645d996753ac9e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Jul 2023 17:07:30 +0200 Subject: [PATCH 190/718] Add C versions of C/ZRSCL --- cmake/lapack.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index ce5d0831f..5c6290484 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -686,7 +686,7 @@ set(CLASRC cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c + crot.c crscl.c cspcon.c csprfs.c cspsv.c cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c cstegr.c cstein.c csteqr.c csycon.c csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c @@ -878,7 +878,7 @@ set(ZLASRC zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c zspcon.c zsprfs.c zspsv.c + zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c zstegr.c zstein.c zsteqr.c zsycon.c zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c From afef854863fc48ee6a520636af9e49216f18d82e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Jul 2023 17:08:27 +0200 Subject: [PATCH 191/718] Add C versions of C/ZRSCL --- lapack-netlib/SRC/crscl.c | 735 ++++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zrscl.c | 735 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1470 insertions(+) create mode 100644 lapack-netlib/SRC/crscl.c create mode 100644 lapack-netlib/SRC/zrscl.c diff --git a/lapack-netlib/SRC/crscl.c b/lapack-netlib/SRC/crscl.c new file mode 100644 index 000000000..7c87553d5 --- /dev/null +++ b/lapack-netlib/SRC/crscl.c @@ -0,0 +1,735 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX A */ +/* COMPLEX X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector X. */ +/* > > 0: X(1) = X(1) and X(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexOTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + real absi, absr; + extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + integer *); + real ai, ar, ui, ov, ur; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real safmin, safmax; + extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = slamch_("S"); + safmax = 1.f / safmin; + ov = slamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = r_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.f) { +/* If alpha is real, then we can use csrscl */ + csrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.f) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + csscal_(n, &safmin, &x[1], incx); + r__1 = -safmax / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } else if (absi < safmin) { + r__1 = -safmin / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else { + r__1 = -1.f / ai; + q__1.r = 0.f, q__1.i = r__1; + cscal_(n, &q__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + r__1 = safmin / ur; + r__2 = -safmin / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + csscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + csscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. */ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } else { + r__1 = safmax / ur; + r__2 = -safmax / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + } else { + r__1 = 1.f / ur; + r__2 = -1.f / ui; + q__1.r = r__1, q__1.i = r__2; + cscal_(n, &q__1, &x[1], incx); + } + } + + return 0; + +/* End of CRSCL */ + +} /* crscl_ */ + diff --git a/lapack-netlib/SRC/zrscl.c b/lapack-netlib/SRC/zrscl.c new file mode 100644 index 000000000..2264b5465 --- /dev/null +++ b/lapack-netlib/SRC/zrscl.c @@ -0,0 +1,735 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZDRSCL + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZRSCL( N, A, X, INCX ) */ + +/* INTEGER INCX, N */ +/* COMPLEX*16 A */ +/* COMPLEX*16 X( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */ +/* > 1/a. This is done without overflow or underflow as long as */ +/* > the final result x/a does not overflow or underflow. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of components of the vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 */ +/* > The scalar a which is used to divide each component of x. */ +/* > A must not be 0, or the subroutine will divide by zero. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension */ +/* > (1+(N-1)*abs(INCX)) */ +/* > The n-element vector x. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] INCX */ +/* > \verbatim */ +/* > INCX is INTEGER */ +/* > The increment between successive values of the vector SX. */ +/* > > 0: SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i), 1< i<= n */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16OTHERauxiliary */ + +/* ===================================================================== */ +/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, + integer *incx) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + doublereal absi, absr; + extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + doublecomplex *, integer *); + doublereal ai, ar; + extern doublereal dlamch_(char *); + doublereal ui, ov, ur, safmin, safmax; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zdrscl_(integer *, doublereal *, + doublecomplex *, integer *); + + +/* -- LAPACK auxiliary routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Quick return if possible */ + + /* Parameter adjustments */ + --x; + + /* Function Body */ + if (*n <= 0) { + return 0; + } + +/* Get machine parameters */ + + safmin = dlamch_("S"); + safmax = 1. / safmin; + ov = dlamch_("O"); + +/* Initialize constants related to A. */ + + ar = a->r; + ai = d_imag(a); + absr = abs(ar); + absi = abs(ai); + + if (ai == 0.) { +/* If alpha is real, then we can use csrscl */ + zdrscl_(n, &ar, &x[1], incx); + + } else if (ar == 0.) { +/* If alpha has a zero real part, then we follow the same rules as if */ +/* alpha were real. */ + if (absi > safmax) { + zdscal_(n, &safmin, &x[1], incx); + d__1 = -safmax / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } else if (absi < safmin) { + d__1 = -safmin / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else { + d__1 = -1. / ai; + z__1.r = 0., z__1.i = d__1; + zscal_(n, &z__1, &x[1], incx); + } + + } else { +/* The following numbers can be computed. */ +/* They are the inverse of the real and imaginary parts of 1/alpha. */ +/* Note that a and b are always different from zero. */ +/* NaNs are only possible if either: */ +/* 1. alphaR or alphaI is NaN. */ +/* 2. alphaR and alphaI are both infinite, in which case it makes sense */ +/* to propagate a NaN. */ + ur = ar + ai * (ai / ar); + ui = ai + ar * (ar / ai); + + if (abs(ur) < safmin || abs(ui) < safmin) { +/* This means that both alphaR and alphaI are very small. */ + d__1 = safmin / ur; + d__2 = -safmin / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + zdscal_(n, &safmax, &x[1], incx); + } else if (abs(ur) > safmax || abs(ui) > safmax) { + if (absr > ov || absi > ov) { +/* This means that a and b are both Inf. No need for scaling. */ + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + zdscal_(n, &safmin, &x[1], incx); + if (abs(ur) > ov || abs(ui) > ov) { +/* Infs were generated. We do proper scaling to avoid them. */ + if (absr >= absi) { +/* ABS( UR ) <= ABS( UI ) */ + ur = safmin * ar + safmin * (ai * (ai / ar)); + ui = safmin * ai + ar * (safmin * ar / ai); + } else { +/* ABS( UR ) > ABS( UI ) */ + ur = safmin * ar + ai * (safmin * ai / ar); + ui = safmin * ai + safmin * (ar * (ar / ai)); + } + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } else { + d__1 = safmax / ur; + d__2 = -safmax / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + } else { + d__1 = 1. / ur; + d__2 = -1. / ui; + z__1.r = d__1, z__1.i = d__2; + zscal_(n, &z__1, &x[1], incx); + } + } + + return 0; + +/* End of ZRSCL */ + +} /* zrscl_ */ + From 1e4a3a2b5e111a6a94eb53946fa92c1715c5dd5e Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Wed, 12 Jul 2023 12:55:50 +0100 Subject: [PATCH 192/718] Fixes RVV masked intrinsics for izamax/izamin kernels --- kernel/riscv64/izamax_rvv.c | 18 +++++++++--------- kernel/riscv64/izamin_rvv.c | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e61d0cbec..e43ded820 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -77,9 +77,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); @@ -138,9 +138,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl); - + v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, vl); } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index 297b3c99a..cc3c37c8e 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -47,9 +47,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -74,9 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_m +#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_m +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); @@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, vl); From 45b2cd2fb299efdda04e539467fc3b15c2bd6d63 Mon Sep 17 00:00:00 2001 From: Andy Mroczkowski Date: Wed, 12 Jul 2023 09:37:45 -0400 Subject: [PATCH 193/718] treat armv8 CMAKE_SYSTEM_PROCESSOR as arm64 The cmake scripts incorrectly treated armv8 as 32-bit arm, causing compilation issues. This just adds 'armv8' to the arm64 condition check. --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..bf3c23d23 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -54,7 +54,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() From e08743d9771c96afa6dce3d2326aab3554db6337 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Jul 2023 23:02:36 +0200 Subject: [PATCH 194/718] Update to use safe scaling algorithm from Reference-LAPACK PR 527 --- interface/rotg.c | 62 ++++++++++++++++++++++++++++++++++------------- interface/zrotg.c | 40 ++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 22 deletions(-) diff --git a/interface/rotg.c b/interface/rotg.c index 69443a5a0..3ccf4f7eb 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -1,9 +1,11 @@ #include +#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ #endif +#ifdef DOUBLE + long double safmin = DBL_MIN; +#else + long double safmin = FLT_MIN; +#endif + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da = *DA; long double db = *DB; long double c; long double s; - long double r, roe, z; + long double r, z; + long double sigma, dascal,dbscal; long double ada = fabsl(da); long double adb = fabsl(db); - long double scale = ada + adb; + long double maxab = MAX(ada,adb); + long double safmax; + long double scale; + #ifndef CBLAS PRINT_DEBUG_NAME; @@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + } else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { - r = sqrt(da * da + db * db); - if (roe < 0) r = -r; + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + if (ada > adb) + sigma = copysign(1.,da); + else + sigma = copysign(1.,db); + dascal = da / scale; + dbscal = db / scale; + r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal)); c = da / r; s = db / r; z = ONE; @@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ FLOAT db = *DB; FLOAT c = *C; FLOAT s = *S; - FLOAT r, roe, z; + FLOAT sigma; + FLOAT r, z; FLOAT ada = fabs(da); FLOAT adb = fabs(db); - FLOAT scale = ada + adb; + FLOAT maxab = MAX(ada,adb); + long double safmax ; + FLOAT scale ; + + safmax = 1./safmin; + scale = MIN(MAX(safmin,maxab), safmax); + + if (ada > adb) + sigma = sign(1.,da); + else + sigma = sign(1.,db); #ifndef CBLAS PRINT_DEBUG_NAME; @@ -77,20 +108,17 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ PRINT_DEBUG_CNAME; #endif - roe = db; - if (ada > adb) roe = da; - if (scale == ZERO) { + if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; + DA = ZERO; *DB = ZERO; } else { FLOAT aa = da / scale; FLOAT bb = db / scale; - r = scale * sqrt(aa * aa + bb * bb); - if (roe < 0) r = -r; + r = sigma * scale * sqrt(aa * aa + bb * bb); c = da / r; s = db / r; z = ONE; diff --git a/interface/zrotg.c b/interface/zrotg.c index 123f4da85..7c4e2ed08 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -1,9 +1,11 @@ #include +#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" #endif + #ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ @@ -14,6 +16,12 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT *S = (FLOAT*) VS; #endif /* CBLAS */ +#ifdef DOUBLE + long double safmin = DBL_MIN; +#else + long double safmin = FLT_MIN; +#endif + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); @@ -23,6 +31,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { long double r; long double ada = fabsl(da_r) + fabsl(da_i); + long double adb = sqrt(db_r * db_r + db_i * db_i); PRINT_DEBUG_NAME; @@ -38,10 +47,24 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { *(DA + 1) = db_i; } else { long double alpha_r, alpha_i; + long double safmax = 1./safmin; + long double sigma; + long double maxab = MAX(ada,adb); + long double scale = MIN(MAX(safmin,maxab), safmax); - ada = sqrt(da_r * da_r + da_i * da_i); - r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); + long double aa_r = da_r / scale; + long double aa_i = da_i / scale; + long double bb_r = db_r / scale; + long double bb_i = db_i / scale; + + if (ada > adb) + sigma = copysign(1.,da_r); + else + sigma = copysign(1.,db_r); + + r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); + alpha_r = da_r / ada; alpha_i = da_i / ada; @@ -60,7 +83,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT r; FLOAT ada = fabs(da_r) + fabs(da_i); - FLOAT adb; + FLOAT ada = fabs(db_r) + fabs(db_i); PRINT_DEBUG_NAME; @@ -75,6 +98,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { *(DA + 0) = db_r; *(DA + 1) = db_i; } else { + long double safmax = 1./safmin; FLOAT scale; FLOAT aa_r, aa_i, bb_r, bb_i; FLOAT alpha_r, alpha_i; @@ -108,14 +132,20 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { scale = (bb_i / bb_r); adb = bb_r * sqrt(ONE + scale * scale); } - scale = ada + adb; + FLOAT maxab = MAX(ada,adb); + scale = MIN(MAX(safmin,maxab), safmax); aa_r = da_r / scale; aa_i = da_i / scale; bb_r = db_r / scale; bb_i = db_i / scale; - r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); + if (ada > adb) + sigma = copysign(1.,da_r); + else + sigma = copysign(1.,db_r); + + r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); alpha_r = da_r / ada; alpha_i = da_i / ada; From affeef0b9c1ef99fca1a97dd4635e4f6552f6b71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 08:38:03 +0200 Subject: [PATCH 195/718] Fix gmake build not always picking the right ARM64 arch options for clang (#4136) * Fix gcc version checks erroneously excluding clang * Avoid some mtune names not supported by (Apple)Clang --- Makefile.arm64 | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 064e84cbb..1b10446f7 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -69,7 +69,7 @@ endif # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 @@ -92,9 +92,14 @@ endif # in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) +CCOMMON_OPT += -march=armv8.4-a+sve +ifeq (1, $(ISCLANG)) +CCOMMON_OPT += -mtune=cortex-x1 +else +CCOMMON_OPT += -mtune=neoverse-v1 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif @@ -122,8 +127,8 @@ endif # in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ10), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else @@ -155,7 +160,7 @@ endif # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ8), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG))) CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 @@ -196,8 +201,13 @@ endif endif ifeq ($(CORE), THUNDERX3T110) -ifeq ($(GCCVERSIONGTEQ10), 1) -CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) +CCOMMON_OPT += -march=armv8.3-a +ifeq (0, $(ISCLANG)) +CCOMMON_OPT += -mtune=thunderx3t110 +else +CCOMMON_OPT += -mtune=thunderx2t99 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 endif @@ -225,9 +235,12 @@ endif endif endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), EMAG8180) -CCOMMON_OPT += -march=armv8-a -mtune=emag +CCOMMON_OPT += -march=armv8-a +ifeq ($(ISCLANG), 0) +CCOMMON_OPT += -mtune=emag +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=emag endif From 0f2ce93904272255a0e0a60d8d296ab5974ef235 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 10:56:59 +0200 Subject: [PATCH 196/718] typo fix --- interface/rotg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/rotg.c b/interface/rotg.c index 3ccf4f7eb..530dce16a 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -112,7 +112,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ if (adb == ZERO) { *C = ONE; *S = ZERO; - DA = ZERO; + *DA = ZERO; *DB = ZERO; } else { FLOAT aa = da / scale; From 7c75c8b2fe3fc4b2e6c5c09f63d7ce2b8d5b5c18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 21:40:12 +0200 Subject: [PATCH 197/718] fix truncated edit --- interface/rotg.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/interface/rotg.c b/interface/rotg.c index 530dce16a..8d6df531a 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -112,8 +112,12 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ if (adb == ZERO) { *C = ONE; *S = ZERO; - *DA = ZERO; *DB = ZERO; + else if (ada == ZERO) { + *C = ZERO; + *S = ONE; + *DA = *DB; + *DB = ONE; } else { FLOAT aa = da / scale; FLOAT bb = db / scale; From 9567305e4c4711c49530917419dc391c00d414b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 23:21:18 +0200 Subject: [PATCH 198/718] Restore initialization of data01,data02 --- kernel/generic/ztrsm_utcopy_1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 08f85e891..5833a64ef 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02; + FLOAT data01=0.0, data02=0.0; FLOAT *a1; lda *= 2; From cfa0a80664591d0243903d626aefec922cb69e6a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 23:23:12 +0200 Subject: [PATCH 199/718] Restore initialization of data variables --- kernel/generic/ztrsm_utcopy_2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index 387bb2532..bc495f7c6 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; + data01=data02=data07=data08=0.0; lda *= 2; jj = offset; From 5e1103b8d7cd3ff60dc7ebddbcc8f47c9c927f98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jul 2023 23:35:38 +0200 Subject: [PATCH 200/718] Update rotg.c --- interface/rotg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interface/rotg.c b/interface/rotg.c index 8d6df531a..8d40d9c53 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -98,9 +98,9 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ scale = MIN(MAX(safmin,maxab), safmax); if (ada > adb) - sigma = sign(1.,da); + sigma = copysign(1.,da); else - sigma = sign(1.,db); + sigma = copysign(1.,db); #ifndef CBLAS PRINT_DEBUG_NAME; @@ -113,7 +113,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ *C = ONE; *S = ZERO; *DB = ZERO; - else if (ada == ZERO) { + } else if (ada == ZERO) { *C = ZERO; *S = ONE; *DA = *DB; From 04cdf5efb4a13010874bed854df4264a5730819e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Jul 2023 00:05:00 +0200 Subject: [PATCH 201/718] fix typo and missing declaration --- interface/zrotg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index 7c4e2ed08..dd765f05f 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -83,7 +83,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT r; FLOAT ada = fabs(da_r) + fabs(da_i); - FLOAT ada = fabs(db_r) + fabs(db_i); + FLOAT adb = fabs(db_r) + fabs(db_i); PRINT_DEBUG_NAME; @@ -99,7 +99,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { *(DA + 1) = db_i; } else { long double safmax = 1./safmin; - FLOAT scale; + FLOAT scale, sigma; FLOAT aa_r, aa_i, bb_r, bb_i; FLOAT alpha_r, alpha_i; From 3d31191b0fc5f25946b45bf16708aacd0e61c40b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Jul 2023 11:06:48 +0200 Subject: [PATCH 202/718] Work around Clang failing to disambiguate SVE intrinsics and add AppleClang crossbuild to MacOS/arm64 DYNAMIC_ARCH in AzureCI (#4140) * Add AppleClang crossbuild to MacOS/arm64 DYNAMIC_ARCH * add casts to disambiguate svwhilelt for clang --- azure-pipelines.yml | 13 +++++++++++++ kernel/arm64/dot_kernel_sve.c | 4 ++-- kernel/arm64/gemm_ncopy_sve_v1x8.c | 2 +- kernel/arm64/gemm_tcopy_sve_v1x8.c | 2 +- kernel/arm64/trmm_lncopy_sve_v1.c | 8 ++++---- kernel/arm64/trmm_ltcopy_sve_v1.c | 8 ++++---- kernel/arm64/trmm_uncopy_sve_v1.c | 8 ++++---- kernel/arm64/trmm_utcopy_sve_v1.c | 8 ++++---- kernel/arm64/trsm_lncopy_sve.c | 8 ++++---- 9 files changed, 37 insertions(+), 24 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 65ef538e9..ff56ad00b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -271,6 +271,19 @@ jobs: - script: | make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 +- job: OSX_xbuild_DYNAMIC_ARM64 + pool: + vmImage: 'macOS-11' + variables: + CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 + steps: + - script: | + ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs + /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus + /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 8460e0d5e..9c057551e 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG sve_width = SVE_WIDTH; for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT(i, n); - svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); + svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c index 113b1ee40..7b2a2e767 100644 --- a/kernel/arm64/gemm_ncopy_sve_v1x8.c +++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { BLASLONG remaining_n = n - single_vectors_n; if (remaining_n) { a_offset_inner = a_offset; - svbool_t pg = SV_WHILE(0L, remaining_n); + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); uint64_t active = remaining_n; uint64_t i_cnt = m >> 2; while (i_cnt--) { diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c index 68a2cc07c..9a93b6cb7 100644 --- a/kernel/arm64/gemm_tcopy_sve_v1x8.c +++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG remaining_n = n - single_vectors_n; if (remaining_n) { a_offset_inner = a_offset; - svbool_t pg = SV_WHILE(0L, remaining_n); + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); uint64_t active = remaining_n; uint64_t i_cnt = m >> 2; while (i_cnt--) { diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index 918e945ac..c7f79e3fd 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index b76cc56de..b3ba68973 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 75fa163ae..a47d2096c 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 36a03242a..c5188beb4 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 5a9d4194a..2895eb85d 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif From 4c43d1eebadf32956efc7a7deb172a6259dd1ace Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Jul 2023 07:47:19 +0200 Subject: [PATCH 203/718] Fix C prototypes and LAPACKE headers for ?GEDMD/?GEDMDQ (#4134) * Fix prototypes for ?GEDMD/?GEDMDQ and their LAPACKE interfaces --- lapack-netlib/LAPACKE/include/lapack.h | 176 +++++++++++++----- lapack-netlib/LAPACKE/include/lapacke.h | 106 +++++------ lapack-netlib/LAPACKE/src/lapacke_cgedmd.c | 52 ++++-- .../LAPACKE/src/lapacke_cgedmd_work.c | 32 ++-- lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c | 42 +++-- .../LAPACKE/src/lapacke_cgedmdq_work.c | 30 +-- lapack-netlib/LAPACKE/src/lapacke_dgedmd.c | 28 +-- .../LAPACKE/src/lapacke_dgedmd_work.c | 38 ++-- lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c | 2 +- .../LAPACKE/src/lapacke_dgedmdq_work.c | 14 +- lapack-netlib/LAPACKE/src/lapacke_sgedmd.c | 30 +-- .../LAPACKE/src/lapacke_sgedmd_work.c | 39 ++-- lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c | 2 +- .../LAPACKE/src/lapacke_sgedmdq_work.c | 14 +- lapack-netlib/LAPACKE/src/lapacke_zgedmd.c | 54 +++--- .../LAPACKE/src/lapacke_zgedmd_work.c | 36 ++-- lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c | 44 +++-- .../LAPACKE/src/lapacke_zgedmdq_work.c | 28 +-- 18 files changed, 454 insertions(+), 313 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index f510c8c80..28f8ad655 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3323,68 +3323,107 @@ void LAPACK_zgesdd_base( #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__) #endif -#define LAPACK_cgedmd LAPACK_GLOBAL(cgedmd,CGEDMD) -void LAPACK_cgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_cgedmd_base LAPACK_GLOBAL(cgedmd,CGEDMD) +void LAPACK_cgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, lapack_complex_float* x, lapack_int const* ldx, - lapack_complex_float* y, lapack_int const* ldy, lapack_int const* k, - lapack_complex_float* reig, lapack_complex_float* imeig, - lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, lapack_complex_float* b, lapack_int const* ldb, lapack_complex_float* w, lapack_int const* ldw, lapack_complex_float* s, lapack_int const* lds, - lapack_complex_float* work, lapack_int const* lwork, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__) +#endif + -#define LAPACK_dgedmd LAPACK_GLOBAL(dgedmd,DGEDMD) -void LAPACK_dgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_dgedmd_base LAPACK_GLOBAL(dgedmd,DGEDMD) +void LAPACK_dgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, double* x, lapack_int const* ldx, - double* y, lapack_int const* ldy, lapack_int const* k, - double* reig, double* imeig, + double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int* k, double* reig, double* imeig, double* z, lapack_int const* ldz, double* res, double* b, lapack_int const* ldb, double* w, lapack_int const* ldw, double* s, lapack_int const* lds, double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_sgedmd LAPACK_GLOBAL(sgedmd,SGEDMD) -void LAPACK_sgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_sgedmd_base LAPACK_GLOBAL(sgedmd,SGEDMD) +void LAPACK_sgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, float* x, lapack_int const* ldx, - float* y, lapack_int const* ldy, lapack_int const* k, - float* reig, float* imeig, + float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, float* reig, float *imeig, float* z, lapack_int const* ldz, float* res, float* b, lapack_int const* ldb, float* w, lapack_int const* ldw, float* s, lapack_int const* lds, float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_zgedmd LAPACK_GLOBAL(zgedmd,ZGEDMD) -void LAPACK_zgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_zgedmd_base LAPACK_GLOBAL(zgedmd,ZGEDMD) +void LAPACK_zgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, lapack_complex_double* x, lapack_int const* ldx, - lapack_complex_double* y, lapack_int const* ldy, lapack_int const* k, - lapack_complex_double* reig, lapack_complex_double* imeig, - lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int *k, lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, lapack_complex_double* b, lapack_int const* ldb, lapack_complex_double* w, lapack_int const* ldw, lapack_complex_double* s, lapack_int const* lds, - lapack_complex_double* work, lapack_int const* lwork, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* rwork, lapack_int const* lrwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_cgedmdq LAPACK_GLOBAL(cgedmdq,CGEDMDQ) -void LAPACK_cgedmdq( +#define LAPACK_cgedmdq_base LAPACK_GLOBAL(cgedmdq,CGEDMDQ) +void LAPACK_cgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3392,35 +3431,54 @@ void LAPACK_cgedmdq( lapack_complex_float* x, lapack_int const* ldx, lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, float const* tol, lapack_int const* k, - lapack_complex_float* reig, lapack_complex_float* imeig, - lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, lapack_complex_float* b, lapack_int const* ldb, lapack_complex_float* v, lapack_int const* ldv, lapack_complex_float* s, lapack_int const* lds, - lapack_complex_float* work, lapack_int const* lwork, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_dgedmdq LAPACK_GLOBAL(dgedmdq,DGEDMDQ) -void LAPACK_dgedmdq( +#define LAPACK_dgedmdq_base LAPACK_GLOBAL(dgedmdq,DGEDMDQ) +void LAPACK_dgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, double* f, lapack_int const* ldf, double* x, lapack_int const* ldx, double* y, lapack_int const* ldy, lapack_int const* nrnk, - double const* tol, lapack_int const* k, - double* reig, double* imeig, + double const* tol, lapack_int* k, + double* reig, double *imeig, double* z, lapack_int const* ldz, double* res, double* b, lapack_int const* ldb, double* v, lapack_int const* ldv, double* s, lapack_int const* lds, double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_sgedmdq LAPACK_GLOBAL(sgedmdq,SGEDMDQ) -void LAPACK_sgedmdq( +#define LAPACK_sgedmdq_base LAPACK_GLOBAL(sgedmdq,SGEDMDQ) +void LAPACK_sgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3435,10 +3493,19 @@ void LAPACK_sgedmdq( float* s, lapack_int const* lds, float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_zgedmdq LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) -void LAPACK_zgedmdq( +#define LAPACK_zgedmdq_base LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) +void LAPACK_zgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3446,14 +3513,25 @@ void LAPACK_zgedmdq( lapack_complex_double* x, lapack_int const* ldx, lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, double const* tol, lapack_int const* k, - lapack_complex_double* reig, lapack_complex_double* imeig, - lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, lapack_complex_double* b, lapack_int const* ldb, lapack_complex_double* v, lapack_int const* ldv, lapack_complex_double* s, lapack_int const* lds, - lapack_complex_double* work, lapack_int const* lwork, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info + +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__) +#endif #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) lapack_int LAPACK_cgesv( @@ -21649,7 +21727,7 @@ void LAPACK_ztrevc_base( #endif ); #ifdef LAPACK_FORTRAN_STRLEN_END - #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, 1, 1) + #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, (size_t)1, 1) #else #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__) #endif diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 9a9ab4753..377e2a6bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -5713,55 +5713,59 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, double* rwork, lapack_int* iwork ); lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, float* x, lapack_int ldx, - float* y, lapack_int ldy, lapack_int k, - float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, - lapack_int ldb, float* w, lapack_int ldw, - float* s, lapack_int lds, float* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ); + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, double* x, lapack_int ldx, - double* y, lapack_int ldy, lapack_int k, - double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, - lapack_int ldb, double* w, lapack_int ldw, - double* s, lapack_int lds, double* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ); + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double *imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_float* x, - lapack_int ldx, lapack_complex_float* y, - lapack_int ldy, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_double* x, - lapack_int ldx, lapack_complex_double* y, - lapack_int ldy, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, @@ -5769,8 +5773,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - float* reig, float* imeig, float* z, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float *imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, float* s, lapack_int lds, float* work, @@ -5782,8 +5786,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - double* reig, double* imeig, double* z, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, double* s, lapack_int lds, double* work, @@ -5796,17 +5800,16 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, - lapack_int* iwork, - lapack_int liwork ); + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, @@ -5814,17 +5817,16 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, - lapack_int* iwork, - lapack_int liwork ); + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv, diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c index a269b0daf..6c77e199e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c @@ -32,22 +32,26 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - lapack_complex_float* x, lapack_int ldx, - lapack_complex_float* y, lapack_int ldy, lapack_int k, - lapack_complex_float* reig, lapack_complex_float* imeig, +lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_float* work = NULL; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; lapack_int* iwork = NULL; - lapack_complex_float work_query; + lapack_complex_float zwork_query; + float work_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgedmd", -1 ); @@ -77,36 +81,44 @@ lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, + lzwork, &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } + lzwork = LAPACK_C2INT( zwork_query ); lwork = LAPACK_C2INT( work_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ - info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_cgedmd", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c index 534934efb..08d8b91f5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c @@ -33,23 +33,25 @@ #include "lapacke_utils.h" lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, + char jobr, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, lapack_complex_float* x, lapack_int ldx, - lapack_complex_float* y, lapack_int ldy, lapack_int k, - lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* y, lapack_int ldy, lapack_int nrnk, + float* tol, lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork, + &info ); if( info < 0 ) { info = info - 1; } @@ -99,9 +101,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, + work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -143,9 +146,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c index 60e83729b..b0b258f97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c @@ -37,20 +37,22 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, lapack_int m, lapack_int n, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, - lapack_int ldy, lapack_int nrnk, float tol, - lapack_int k, lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_float* work = NULL; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; lapack_int* iwork = NULL; - lapack_complex_float work_query; + lapack_complex_float zwork_query; + float work_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); @@ -85,36 +87,44 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, /* Query optimal working array(s) size */ info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, &work_query, lwork, - &iwork_query, liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } + lzwork = LAPACK_C2INT( zwork_query ); lwork = LAPACK_C2INT( work_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, work, lwork, iwork, - liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_cgedmdq", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c index 5bdbd3f56..05287c1bc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c @@ -38,15 +38,15 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, - lapack_int ldz, lapack_complex_float* res, + lapack_int ldz, float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, - lapack_int lds, lapack_complex_float* work, + lapack_int lds, lapack_complex_float *zwork, + lapack_int lzwork, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { @@ -54,9 +54,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -112,11 +112,11 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, return info; } /* Query optimal working array(s) size if requested */ - if( lwork == -1 || liwork == -1 ) { + if( lzwork == -1 || lwork == -1 || liwork == -1 ) { LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -165,9 +165,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c index 246d7f649..6802378da 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c @@ -32,11 +32,13 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int k, double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, lapack_int ldb, +lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, double* reig, double* imeig, + double* z, lapack_int ldz, + double* res, double* b, lapack_int ldb, double* w, lapack_int ldw, double* s, lapack_int lds) { lapack_int info = 0; @@ -74,10 +76,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; @@ -96,10 +98,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, goto exit_level_1; } /* Call middle-level interface */ - info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); exit_level_1: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c index 4d1169de9..987709a1b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c @@ -33,22 +33,23 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, double* x, lapack_int ldx, - double* y, lapack_int ldy, lapack_int k, - double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, - lapack_int ldb, double* w, lapack_int ldw, - double* s, lapack_int lds, double* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ) + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -98,9 +99,9 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -142,9 +143,10 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, + y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t, + &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork, + iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c index f3d621ba9..5c3c39308 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c @@ -36,7 +36,7 @@ lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, + lapack_int nrnk, double* tol, lapack_int k, double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, double* s, lapack_int lds) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c index 51b2a66d8..149e6d24f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c @@ -37,7 +37,7 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, + lapack_int nrnk, double* tol, lapack_int k, double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, @@ -49,8 +49,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; @@ -109,8 +109,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } @@ -160,8 +160,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c index 879631b1d..6865fcf65 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c @@ -32,12 +32,14 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int k, float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, lapack_int ldb, - float* w, lapack_int ldw, float* s, lapack_int lds) +lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; @@ -74,10 +76,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; @@ -96,10 +98,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, goto exit_level_1; } /* Call middle-level interface */ - info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); exit_level_1: diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c index 762a9b271..5b24152da 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c @@ -33,22 +33,23 @@ #include "lapacke_utils.h" lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, float* x, lapack_int ldx, - float* y, lapack_int ldy, lapack_int k, - float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, - lapack_int ldb, float* w, lapack_int ldw, - float* s, lapack_int lds, float* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ) + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -98,9 +99,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, work, &lwork, iwork, + &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -142,9 +144,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, + &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c index e202d7fbd..e65c2094f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c @@ -36,7 +36,7 @@ lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, + lapack_int nrnk, float* tol, lapack_int k, float* reig, float* imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, float* s, lapack_int lds) diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c index 9039898d2..e1c1f5c98 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c @@ -37,7 +37,7 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, + lapack_int nrnk, float* tol, lapack_int k, float* reig, float* imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, @@ -49,8 +49,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; @@ -109,8 +109,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } @@ -160,8 +160,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c index f3f421c54..e4ea4fe10 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c @@ -32,24 +32,28 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - lapack_complex_double* x, lapack_int ldx, - lapack_complex_double* y, lapack_int ldy, - lapack_int k, lapack_complex_double* reig, - lapack_complex_double* imeig, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, +lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* zw, lapack_int lzw, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_double* work = NULL; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; lapack_int* iwork = NULL; - lapack_complex_double work_query; + double work_query; lapack_int iwork_query; + lapack_complex_double zwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgedmd", -1 ); return -1; @@ -78,36 +82,44 @@ lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } lwork = LAPACK_Z2INT( work_query ); liwork = iwork_query; + lzwork = LAPACK_Z2INT( zwork_query ); /* Allocate memory for work arrays */ - work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ - info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zgedmd", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c index 2554411ec..ebacfaa94 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c @@ -33,25 +33,27 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_double* x, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, - lapack_int ldy, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork, + &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -101,9 +103,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, work, + &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -145,9 +148,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c index 3648ffdf2..368d48e20 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c @@ -37,23 +37,25 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, lapack_int m, lapack_int n, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, - lapack_int ldy, lapack_int nrnk, double tol, - lapack_int k, lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, lapack_complex_double* b, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_double* work = NULL; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; lapack_int* iwork = NULL; - lapack_complex_double work_query; + double work_query; + lapack_complex_double zwork_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { - LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 ); return -1; } #ifndef LAPACK_DISABLE_NAN_CHECK @@ -85,36 +87,44 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, /* Query optimal working array(s) size */ info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, &work_query, lwork, - &iwork_query, liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } lwork = LAPACK_Z2INT( work_query ); + lzwork = LAPACK_Z2INT( zwork_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, work, lwork, iwork, - liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zgedmdq", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c index 9afceba07..131e4f9ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c @@ -38,15 +38,15 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, - lapack_int lds, lapack_complex_double* work, + lapack_int lds, lapack_complex_double* zwork, + lapack_int lzwork, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { @@ -54,9 +54,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -114,9 +114,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -165,9 +165,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } From 0a637cc403b2af09b20c48238e2376793a25c8c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Jul 2023 16:37:42 +0200 Subject: [PATCH 204/718] Fix workspace query corner cases to always return at least 1 (Reference-LAPACK PR 883) (#4146) * Fix workspace query corner cases to always return at least 1 --- lapack-netlib/SRC/csytrf.f | 2 +- lapack-netlib/SRC/csytrf_rk.f | 2 +- lapack-netlib/SRC/ctrevc3.f | 4 ++-- lapack-netlib/SRC/dsytrf.f | 2 +- lapack-netlib/SRC/dsytrf_rk.f | 2 +- lapack-netlib/SRC/dtrevc3.f | 2 +- lapack-netlib/SRC/ssytrf.f | 2 +- lapack-netlib/SRC/ssytrf_rk.f | 2 +- lapack-netlib/SRC/strevc3.f | 2 +- lapack-netlib/SRC/zsytrf.f | 2 +- lapack-netlib/SRC/zsytrf_rk.f | 2 +- lapack-netlib/SRC/ztrevc3.f | 4 ++-- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index ebf228f18..951196b83 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f index 9c2b7182f..996801e7d 100644 --- a/lapack-netlib/SRC/csytrf_rk.f +++ b/lapack-netlib/SRC/csytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index 0f58696b2..11b32104d 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index a39b03283..aee9b3f6a 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 7341b9263..086586968 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index a4651e788..c8c04ad13 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index d188589b9..31e38e466 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index ec84fcb1b..8e1ef460a 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 5af57123b..253cbc24c 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f index a775a8758..dc9016c69 100644 --- a/lapack-netlib/SRC/zsytrf.f +++ b/lapack-netlib/SRC/zsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f index 3b398ce6c..af8b8d501 100644 --- a/lapack-netlib/SRC/zsytrf_rk.f +++ b/lapack-netlib/SRC/zsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f index 6300e80ae..8fb144e0c 100644 --- a/lapack-netlib/SRC/ztrevc3.f +++ b/lapack-netlib/SRC/ztrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 From 8da6aca2ecf557b6250631414a114bdb3f743c86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jul 2023 22:15:15 +0200 Subject: [PATCH 205/718] Support Alder Lake N (fam 6 exmodel 11 model 14) as Haswell --- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index c2486e380..c485f3ddf 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1551,6 +1551,7 @@ int get_cpuname(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) @@ -2360,6 +2361,7 @@ int get_coretype(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N #ifndef NO_AVX2 if(support_avx2()) return CORE_HASWELL; From ada9e442ebfe945895c444903f55b51b9daf8776 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jul 2023 23:13:56 +0200 Subject: [PATCH 206/718] Add Apple M1 build using gcc,gmake and OpenMP --- .cirrus.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 8a1c4a0a8..02cd40997 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -30,6 +30,15 @@ task: - cd build - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. - make + +task: + name: AppleM1/GCC/MAKE/OPENMP + compile_script: + - brew install gcc@11 + - export PATH=/opt/homebrew/bin:$PATH + - export LDFLAGS="-L/opt/homebrew/lib" + - export CPPFLAGS="-I/opt/homebrew/include" + - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest From ca7199f249bb6a87f201a1cd564d42fef338f29a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Jul 2023 14:48:42 +0200 Subject: [PATCH 207/718] Treat newer Neoverse as N1 if SVE unavailable (may be disabled in container/cloud env) --- driver/others/dynamic_arm64.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0f47b287c..b29e6e46c 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -147,6 +147,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_CPUID #define HWCAP_CPUID (1 << 11) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -281,9 +284,15 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; #ifndef NO_SVE case 0xd49: - return &gotoblas_NEOVERSEN2; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + return &gotoblas_NEOVERSEN1; + else + return &gotoblas_NEOVERSEN2; case 0xd40: - return &gotoblas_NEOVERSEV1; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + return &gotoblas_NEOVERSEN1; + else + return &gotoblas_NEOVERSEV1; #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; From 5c58994eb26bfd5715f4100a213a0e69b280ea16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Jul 2023 18:27:41 +0200 Subject: [PATCH 208/718] Add fallback warning --- driver/others/dynamic_arm64.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index b29e6e46c..ef2597234 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -137,6 +137,8 @@ extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" #define NUM_CORETYPES 13 @@ -284,14 +286,16 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; #ifndef NO_SVE case 0xd49: - if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); return &gotoblas_NEOVERSEN1; - else + } else return &gotoblas_NEOVERSEN2; case 0xd40: - if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); return &gotoblas_NEOVERSEN1; - else + }else return &gotoblas_NEOVERSEV1; #endif case 0xd05: // Cortex A55 From 66904f814853d1d34aed020eebbd826e8466f029 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Jul 2023 22:14:34 +0200 Subject: [PATCH 209/718] Ensure that a premature call will not overwrite unrelated memory --- driver/others/blas_server_win32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index afa33cccc..5bdfc1276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From b34f19a365a43ce8a05c29a97036bfc9e300c289 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Jul 2023 22:19:22 +0200 Subject: [PATCH 210/718] Ensure that a premature call to set_num_threads will not overwrite unrelated memory --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 051513f27..a8a84acbb 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { increased_threads = 1; - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; From 76ef1672f84bb90574f451d4f4677c37aa6751b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Jul 2023 22:31:07 +0200 Subject: [PATCH 211/718] Override DSDOT with generic code to get rid of qemu precision error --- kernel/riscv64/KERNEL.C910V | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 18cb3bafd..e6f2b3314 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c From 8df0289db61ea5a3e461c94c51a5798e2dd18b86 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Thu, 20 Jul 2023 15:28:35 +0100 Subject: [PATCH 212/718] Adds tail undisturbed for RVV Level 1 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed. --- kernel/riscv64/amax_rvv.c | 8 +++--- kernel/riscv64/amin_rvv.c | 8 +++--- kernel/riscv64/asum_rvv.c | 8 +++--- kernel/riscv64/dot_rvv.c | 16 +++++------ kernel/riscv64/iamax_rvv.c | 24 ++++++++-------- kernel/riscv64/iamin_rvv.c | 24 ++++++++-------- kernel/riscv64/imax_rvv.c | 24 ++++++++-------- kernel/riscv64/imin_rvv.c | 24 ++++++++-------- kernel/riscv64/izamax_rvv.c | 24 ++++++++-------- kernel/riscv64/izamin_rvv.c | 24 ++++++++-------- kernel/riscv64/max_rvv.c | 8 +++--- kernel/riscv64/min_rvv.c | 8 +++--- kernel/riscv64/nrm2_rvv.c | 8 +++--- kernel/riscv64/sum_rvv.c | 8 +++--- kernel/riscv64/zamax_rvv.c | 8 +++--- kernel/riscv64/zamin_rvv.c | 8 +++--- kernel/riscv64/zasum_rvv.c | 12 ++++---- kernel/riscv64/zdot_rvv.c | 56 ++++++++++++++++++------------------- kernel/riscv64/znrm2_rvv.c | 24 ++++++++-------- kernel/riscv64/zsum_rvv.c | 12 ++++---- 20 files changed, 168 insertions(+), 168 deletions(-) diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c index be0bdbea0..451fbc834 100644 --- a/kernel/riscv64/amax_rvv.c +++ b/kernel/riscv64/amax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c index d4926084b..5186d7b12 100644 --- a/kernel/riscv64/amin_rvv.c +++ b/kernel/riscv64/amin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c index 691591e22..0ea610cbb 100644 --- a/kernel/riscv64/asum_rvv.c +++ b/kernel/riscv64/asum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -76,7 +76,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLEV_FLOAT(x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -88,7 +88,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(x, stride_x, vl); vx = VFABSV_FLOAT(vx, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c index 3276695b6..837badf41 100644 --- a/kernel/riscv64/dot_rvv.c +++ b/kernel/riscv64/dot_rvv.c @@ -49,12 +49,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } @@ -69,12 +69,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else if (1 == inc_y) { @@ -88,12 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } else { @@ -108,12 +108,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); - vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); #else vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); - vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl); + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); #endif } } diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c index d3508a91d..8362d7cef 100644 --- a/kernel/riscv64/iamax_rvv.c +++ b/kernel/riscv64/iamax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -68,12 +68,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -106,11 +106,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -125,11 +125,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c index ae1d4f726..f90dbb545 100644 --- a/kernel/riscv64/iamin_rvv.c +++ b/kernel/riscv64/iamin_rvv.c @@ -43,12 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -69,12 +69,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -107,11 +107,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -126,11 +126,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c index 33250568d..b1a77b178 100644 --- a/kernel/riscv64/imax_rvv.c +++ b/kernel/riscv64/imax_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); } } diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c index 4ce49c3af..1de7f3233 100644 --- a/kernel/riscv64/imin_rvv.c +++ b/kernel/riscv64/imin_rvv.c @@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu #define VIDV_UINT __riscv_vid_v_u64m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m8 #define VMVVX_UINT __riscv_vmv_v_x_u64m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFIRSTM __riscv_vfirst_m_b4 #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu #define VIDV_UINT __riscv_vid_v_u32m8 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m8 #define VMVVX_UINT __riscv_vmv_v_x_u32m8 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } else { @@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); } } diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e43ded820..e93f0056c 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -44,13 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,13 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -116,11 +116,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } else { @@ -138,11 +138,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx0, vl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, vl); + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); } } diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index cc3c37c8e..b5bc27404 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -43,13 +43,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFIRSTM __riscv_vfirst_m_b16 #define UINT_V_T vuint64m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u64m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu #define VIDV_UINT __riscv_vid_v_u64m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u64m4 #define VMVVX_UINT __riscv_vmv_v_x_u64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -70,13 +70,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFIRSTM __riscv_vfirst_m_b8 #define UINT_V_T vuint32m4_t -#define VIDV_MASK_UINT __riscv_vid_v_u32m4_mu +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu #define VIDV_UINT __riscv_vid_v_u32m4 -#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu #define VADDVX_UINT __riscv_vadd_vx_u32m4 #define VMVVX_UINT __riscv_vmv_v_x_u32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -113,11 +113,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } else { @@ -136,11 +136,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) // index where element less than v_min mask = VMFLTVV_FLOAT(vx0, v_min, vl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl); - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, vl); + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); } } diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c index 9315321f4..745c27bf4 100644 --- a/kernel/riscv64/max_rvv.c +++ b/kernel/riscv64/max_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmax = VFMAXVV_FLOAT(vmax, vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); } } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c index 158b682fd..78528fef9 100644 --- a/kernel/riscv64/min_rvv.c +++ b/kernel/riscv64/min_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } else { @@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vmin = VFMINVV_FLOAT(vmin, vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); } } diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 42abfa119..994fadb70 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLEV_FLOAT(x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } else { @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VLSEV_FLOAT(x, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); } } diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c index 9715faf22..c5629197f 100644 --- a/kernel/riscv64/sum_rvv.c +++ b/kernel/riscv64/sum_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -73,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLEV_FLOAT(x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } else { @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vl = VSETVL(n); vx = VLSEV_FLOAT(x, stride_x, vl); - vsum = VFADDVV_FLOAT(vsum, vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); } } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index 615b7519c..bbb1e876b 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } @@ -101,7 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmax = VFMAXVV_FLOAT(vmax, v0, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); } } diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index a0d36d46f..c5453121b 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } else { @@ -100,7 +100,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v1 = VFABSV_FLOAT(v1, vl); v0 = VFADDVV_FLOAT(v0, v1, vl); - vmin = VFMINVV_FLOAT(vmin, v0, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); } } diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c index 1d2f0e1fe..ebec1b19c 100644 --- a/kernel/riscv64/zasum_rvv.c +++ b/kernel/riscv64/zasum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) @@ -51,7 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 #endif @@ -75,8 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } @@ -93,8 +93,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index 1543c513d..fa0e89353 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -36,12 +36,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) @@ -52,12 +52,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 #define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #endif @@ -86,14 +86,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } @@ -107,14 +107,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSEG_FLOAT(&vx0, &vx1, x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } else if (inc_y == 1){ @@ -127,14 +127,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSEG_FLOAT(&vy0, &vy1, y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } }else { @@ -148,14 +148,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); - vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); #if !defined(CONJ) - vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); #else - vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl); - vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); #endif } } diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index 5f7873b5a..d2b27aa8d 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else @@ -51,10 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif @@ -85,11 +85,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } else { @@ -103,11 +103,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); - v_max = VFREDMAXVS_FLOAT(v0, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v0, v0, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - v_max = VFREDMAXVS_FLOAT(v1, v_max, vl); - vr = VFMACCVV_FLOAT(vr, v1, v1, vl); + v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); + vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index 44df112c6..b41f70eb5 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -69,8 +69,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSEG_FLOAT(&v0, &v1, x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } else { @@ -82,8 +82,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v_sum = VFADDVV_FLOAT(v_sum, v0, vl); - v_sum = VFADDVV_FLOAT(v_sum, v1, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); } } From 25037ae87535c188a5635b471d566dc3a51e71d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jul 2023 23:14:25 +0200 Subject: [PATCH 213/718] Fix actual arguments in some LAPACK procedure calls (Reference-LAPACK PR 885) (#4155) * Fix actual arguments (Reference-LAPACK PR 885) --- lapack-netlib/SRC/cgelqt3.f | 3 ++- lapack-netlib/SRC/dgelqt3.f | 3 ++- lapack-netlib/SRC/dlatrs.f | 7 ++++-- lapack-netlib/SRC/dtrsyl3.f | 2 +- lapack-netlib/SRC/sgelqt3.f | 3 ++- lapack-netlib/SRC/slatrs.f | 7 ++++-- lapack-netlib/SRC/strsyl3.f | 2 +- lapack-netlib/SRC/zgelqt3.f | 3 ++- lapack-netlib/TESTING/EIG/cerrst.f | 6 ++--- lapack-netlib/TESTING/EIG/derrst.f | 6 ++--- lapack-netlib/TESTING/EIG/serrst.f | 6 ++--- lapack-netlib/TESTING/EIG/zerrst.f | 6 ++--- lapack-netlib/TESTING/LIN/cchktr.f | 9 +++---- lapack-netlib/TESTING/LIN/cerrtr.f | 38 +++++++++++++++--------------- lapack-netlib/TESTING/LIN/derrtr.f | 38 +++++++++++++++--------------- lapack-netlib/TESTING/LIN/serrtr.f | 38 +++++++++++++++--------------- lapack-netlib/TESTING/LIN/zchktr.f | 7 +++--- lapack-netlib/TESTING/LIN/zerrtr.f | 38 +++++++++++++++--------------- 18 files changed, 117 insertions(+), 105 deletions(-) diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f index 1dfbd3f2b..553087bf4 100644 --- a/lapack-netlib/SRC/cgelqt3.f +++ b/lapack-netlib/SRC/cgelqt3.f @@ -159,7 +159,8 @@ * * Compute Householder transform when M=1 * - CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f index 5bcc06a80..ee3bdceb4 100644 --- a/lapack-netlib/SRC/dgelqt3.f +++ b/lapack-netlib/SRC/dgelqt3.f @@ -173,7 +173,8 @@ * * Compute Householder transform when M=1 * - CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1) ) * ELSE * diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index be156bee2..b282f4227 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -261,6 +261,9 @@ DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + DOUBLE PRECISION WORK(1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f index c44ec3808..31a5230ba 100644 --- a/lapack-netlib/SRC/dtrsyl3.f +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -1220,7 +1220,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f index 23816b4c8..82f5c1cf4 100644 --- a/lapack-netlib/SRC/sgelqt3.f +++ b/lapack-netlib/SRC/sgelqt3.f @@ -158,7 +158,8 @@ * * Compute Householder transform when M=1 * - CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) * ELSE * diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 0761d656f..9765ea3d7 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -261,6 +261,9 @@ REAL BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + REAL WORK (1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX @@ -362,7 +365,7 @@ * A is upper triangular. * DO J = 2, N - TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f index 28762c2ed..ef3f2da83 100644 --- a/lapack-netlib/SRC/strsyl3.f +++ b/lapack-netlib/SRC/strsyl3.f @@ -1223,7 +1223,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f index 629a09472..1a71dc44e 100644 --- a/lapack-netlib/SRC/zgelqt3.f +++ b/lapack-netlib/SRC/zgelqt3.f @@ -174,7 +174,8 @@ * * Compute Householder transform when M=1 * - CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index ba97afbe5..1748a2aad 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -160,13 +160,13 @@ * SRNAMT = 'CHETD2' INFOT = 1 - CALL CHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index a55b6eea9..059538644 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -161,13 +161,13 @@ * SRNAMT = 'DSYTD2' INFOT = 1 - CALL DSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index 8c9c0f306..b87fc42ef 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -161,13 +161,13 @@ * SRNAMT = 'SSYTD2' INFOT = 1 - CALL SSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index 948f94bc2..d7b41c053 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -160,13 +160,13 @@ * SRNAMT = 'ZHETD2' INFOT = 1 - CALL ZHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index 4b09361d8..2953a2bd5 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ), SCALE3( 2 ) + REAL RESULT( NTESTS ), RWORK2( 2*NMAX ), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -542,10 +543,10 @@ SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) CALL CCOPY( N, X, 1, B( N+1 ), 1 ) - CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from CLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index 9ba784f62..ab83357f8 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. REAL R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'CLATRS3' INFOT = 1 - CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index d0580497d..878d9070c 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'DLATRS3' INFOT = 1 - CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index af1ce0a8e..391b54c3f 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'SLATRS3' INFOT = 1 - CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 275ca2857..4af538124 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) + DOUBLE PRECISION RESULT( NTESTS ), RWORK2( 2*NMAX), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -544,8 +545,8 @@ CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from ZLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 211b92154..640c39793 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. DOUBLE PRECISION R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'ZLATRS3' INFOT = 1 - CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. From fec4867748ef1c2141fe0c7aa3070f00d5048aab Mon Sep 17 00:00:00 2001 From: martin-frbg Date: Sun, 23 Jul 2023 20:31:55 +0200 Subject: [PATCH 214/718] Fix file permissions (issue 4095) --- benchmark/spr.c | 0 benchmark/spr2.c | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 benchmark/spr.c mode change 100755 => 100644 benchmark/spr2.c diff --git a/benchmark/spr.c b/benchmark/spr.c old mode 100755 new mode 100644 diff --git a/benchmark/spr2.c b/benchmark/spr2.c old mode 100755 new mode 100644 From 7976deff801b2dc7fa26278b075cf6d4bdbaf56e Mon Sep 17 00:00:00 2001 From: martin-frbg Date: Sun, 23 Jul 2023 20:37:07 +0200 Subject: [PATCH 215/718] Fix file permissions (issue 4095) --- kernel/arm64/sgemm_beta.S | 0 kernel/generic/ztrmmkernel_4x4.c | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 kernel/arm64/sgemm_beta.S mode change 100755 => 100644 kernel/generic/ztrmmkernel_4x4.c diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S old mode 100755 new mode 100644 diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c old mode 100755 new mode 100644 From 7c8ea130a35400ee15320b4747d52ff1c8349773 Mon Sep 17 00:00:00 2001 From: steppi Date: Tue, 18 Jul 2023 14:36:03 -0400 Subject: [PATCH 216/718] Set up cirun workflow for arm64 graviton --- .cirun.yml | 16 ++++ .github/workflows/arm64_graviton.yml | 126 +++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 .cirun.yml create mode 100644 .github/workflows/arm64_graviton.yml diff --git a/.cirun.yml b/.cirun.yml new file mode 100644 index 000000000..f0e0149d3 --- /dev/null +++ b/.cirun.yml @@ -0,0 +1,16 @@ +# Self-Hosted Github Action Runners on AWS via Cirun.io +# Reference: https://docs.cirun.io/Reference/yml.html +runners: + - name: "aws-runner-graviton" + # Cloud Provider: AWS + cloud: "aws" + region: "us-east-1" + # Cheapest VM on AWS + instance_type: "c7g.large" + # Ubuntu-22.04, ami image + machine_image: "ami-0a0c8eebcdd6dcbd0" + preemptible: false + # Add this label in the "runs-on" param in .github/workflows/.yml + # So that this runner is created for running the workflow + labels: + - "cirun-aws-runner-graviton" diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml new file mode 100644 index 000000000..bcb05047c --- /dev/null +++ b/.github/workflows/arm64_graviton.yml @@ -0,0 +1,126 @@ +name: arm64 graviton cirun + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}" + + strategy: + fail-fast: false + matrix: + fortran: [gfortran] + build: [cmake, make] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt update + sudo apt-get install -y gfortran cmake ccache libtinfo5 + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run tests + timeout-minutes: 60 + run: | + case "${{ matrix.build }}" in + "make") + MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' + echo "::group::Tests in 'test' directory" + make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'ctest' directory" + make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'utest' directory" + make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + ;; + "cmake") + cd build && ctest + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac From 826a9d5fa47f20f23f42c97385e72e121a2efb4f Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Tue, 25 Jul 2023 11:36:23 +0100 Subject: [PATCH 219/718] Adds tail undisturbed for RVV Level 2 operations During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic. Commit changes intrinsics tail policy to undistrubed. --- kernel/riscv64/gemv_t_rvv.c | 8 ++++---- kernel/riscv64/symv_L_rvv.c | 12 +++++------ kernel/riscv64/symv_U_rvv.c | 12 +++++------ kernel/riscv64/zgemv_t_rvv.c | 40 ++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c index f0c834866..9c859aa50 100644 --- a/kernel/riscv64/gemv_t_rvv.c +++ b/kernel/riscv64/gemv_t_rvv.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLEV_FLOAT(x_ptr, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO va = VLEV_FLOAT(a_ptr, vl); vx = VLSEV_FLOAT(x_ptr, stride_x, vl); - vr = VFMACCVV_FLOAT(vr, va, vx, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c index b27db2e37..888d628a5 100644 --- a/kernel/riscv64/symv_L_rvv.c +++ b/kernel/riscv64/symv_L_rvv.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 7e45b1a01..3cfd3ee4c 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); } v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLEV_FLOAT(&x[i], vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); iy += inc_yv; } @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSEV_FLOAT(&y[i], vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; } @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA VSSEV_FLOAT(&y[iy], stride_y, vy, vl); vx = VLSEV_FLOAT(&x[ix], stride_x, vl); - vr = VFMACCVV_FLOAT(vr, vx, va, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); ix += inc_xv; iy += inc_yv; } diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 15795cc3a..2f0380530 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 @@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 @@ -90,15 +90,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; @@ -134,15 +134,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, vl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, vl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, vl); - vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); #endif j += vl * 2; ix += vl * inc_x * 2; From aea2a4622b2cc3ccb5e3c58dc2e11b6c4471f6cd Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 25 Jul 2023 14:05:17 +0100 Subject: [PATCH 220/718] Use latest non-SVE kernels in ARMV8SVE These are generally better and, in some cases, include threading which helps in the cores we're targeting here. --- kernel/arm64/KERNEL.ARMV8SVE | 70 +++++++++++++++------------------- kernel/arm64/KERNEL.NEOVERSEV1 | 32 ---------------- 2 files changed, 30 insertions(+), 72 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 07393624c..38d7ff8f1 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index d6617e8a4..a8dafe862 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,37 +1,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -DAXPYKERNEL = daxpy_thunderx2t99.S - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c - CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c From f971ef55f2ce09d60b08137ca0608a6475a7611f Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 25 Jul 2023 11:56:33 +0100 Subject: [PATCH 221/718] Add ARMV8SVE to AArch64 Dynamic Dispatch In order to enable support for future cores which have similar tunings (in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy pasting the same parameters. To make `ARMV8SVE` more representive of the common 128-bit SVE case, I've split it and similar parameters from A64FX which has the wider 512-bit SVE. --- Makefile.system | 1 + cmake/arch.cmake | 2 +- driver/others/dynamic_arm64.c | 19 +++++++++++++++- param.h | 41 +++++++++++++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 7d26eccc3..62926b380 100644 --- a/Makefile.system +++ b/Makefile.system @@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e6e434a0a..07df31b89 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index ef2597234..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; @@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -173,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -192,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/param.h b/param.h index aa193a284..84e0c2ac7 100644 --- a/param.h +++ b/param.h @@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 From 24586bc4ffc0486b9f7051ca21a6a16e3c2f5797 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 25 Jul 2023 14:50:38 +0100 Subject: [PATCH 222/718] Disambiguate whilelt --- kernel/arm64/cgemm_ncopy_sve_v1.c | 5 +++-- kernel/arm64/cgemm_tcopy_sve_v1.c | 5 +++-- kernel/arm64/symm_lcopy_sve.c | 9 +++++---- kernel/arm64/symm_ucopy_sve.c | 9 +++++---- kernel/arm64/trsm_ltcopy_sve.c | 9 +++++---- kernel/arm64/trsm_uncopy_sve.c | 9 +++++---- kernel/arm64/trsm_utcopy_sve.c | 9 +++++---- kernel/arm64/zgemm_ncopy_sve_v1.c | 5 +++-- kernel/arm64/zgemm_tcopy_sve_v1.c | 5 +++-- kernel/arm64/zhemm_ltcopy_sve.c | 13 +++++++------ kernel/arm64/zhemm_utcopy_sve.c | 13 +++++++------ kernel/arm64/zsymm_lcopy_sve.c | 9 +++++---- kernel/arm64/zsymm_ucopy_sve.c | 9 +++++---- kernel/arm64/ztrmm_lncopy_sve_v1.c | 9 +++++---- kernel/arm64/ztrmm_ltcopy_sve_v1.c | 9 +++++---- kernel/arm64/ztrmm_uncopy_sve_v1.c | 9 +++++---- kernel/arm64/ztrmm_utcopy_sve_v1.c | 9 +++++---- kernel/arm64/ztrsm_lncopy_sve.c | 9 +++++---- kernel/arm64/ztrsm_ltcopy_sve.c | 9 +++++---- kernel/arm64/ztrsm_uncopy_sve.c | 9 +++++---- kernel/arm64/ztrsm_utcopy_sve.c | 9 +++++---- 21 files changed, 101 insertions(+), 80 deletions(-) diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c index 6aa44a8f6..2fdaf5fcd 100644 --- a/kernel/arm64/cgemm_ncopy_sve_v1.c +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c index 748cd954e..086a2fed1 100644 --- a/kernel/arm64/cgemm_tcopy_sve_v1.c +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index 6ba4afc8b..e138f0647 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 32da5bd16..9a4cb6d4f 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index ac4019e26..fdda992e0 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 8fdcd0f4b..1a03aa974 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 0f5f0dccd..b06166f36 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 8f9b4268a..6b8c93baf 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index c6e50bc1c..fd8d2190f 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 37dbfe4e1..615667264 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 21e03b7be..f763d5607 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); #else @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c index 6f18aa956..5a17d3b19 100644 --- a/kernel/arm64/zsymm_lcopy_sve.c +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c index 6be48cdaf..06989e3aa 100644 --- a/kernel/arm64/zsymm_ucopy_sve.c +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index d34f607ab..5a7171d9d 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index 7f34c9857..3a88f26b2 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index 7eb9452c9..c3dbdcbe3 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index 60c8ff3b4..ddfa7ba4e 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c index eb7cd0294..f81ba14c2 100644 --- a/kernel/arm64/ztrsm_lncopy_sve.c +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 34dbf8a30..46a11abed 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c index 92e086b75..436112130 100644 --- a/kernel/arm64/ztrsm_uncopy_sve.c +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index ccb942e1b..ddf3e265f 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif From ea669c8ae938450e101140d634cca291bffd3898 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Jul 2023 00:27:14 +0200 Subject: [PATCH 223/718] simplify openmp thread limit handling --- common_thread.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/common_thread.h b/common_thread.h index 05e1d5489..0a4c703b7 100644 --- a/common_thread.h +++ b/common_thread.h @@ -136,15 +136,13 @@ typedef struct blas_queue { #ifdef SMP_SERVER extern int blas_server_avail; +extern int blas_omp_number_max; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - if (blas_num_threads_set == 0) openmp_nthreads=omp_get_max_threads(); - else - openmp_nthreads=blas_cpu_number; #endif #ifndef USE_OPENMP @@ -156,7 +154,13 @@ int openmp_nthreads; ) return 1; #ifdef USE_OPENMP - if (blas_cpu_number != openmp_nthreads) { + if (openmp_nthreads > blas_omp_number_max){ +#ifdef DEBUG + fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); +#endif + openmp_nthreads = blas_omp_number_max; + } + if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } #endif From 3326b924b324d49960e0f657f049e5658e7e6c69 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Jul 2023 00:31:24 +0200 Subject: [PATCH 224/718] remove status variable blas_num_threads_set; initialize openmp thread maximum on startup --- driver/others/blas_server_omp.c | 4 +++- driver/others/memory.c | 4 ---- driver/others/memory_qalloc.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 2e0c0f38c..43764df00 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -68,6 +68,7 @@ #endif int blas_server_avail = 0; +int blas_omp_number_max = 0; extern int openblas_omp_adaptive_env(); @@ -100,7 +101,6 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { - blas_num_threads_set = 1; if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; @@ -125,6 +125,8 @@ void openblas_set_num_threads(int num_threads) { } int blas_thread_init(void){ +if(blas_omp_number_max <= 0) + blas_omp_number_max = omp_get_max_threads(); blas_get_cpu_number(); diff --git a/driver/others/memory.c b/driver/others/memory.c index 3cbd17bc2..4fceae754 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } @@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 0b38d1887..6174d9b75 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -283,7 +283,6 @@ The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; -int blas_num_threads_set = 0; int goto_get_num_procs (void) { return blas_cpu_number; From 94adf98bb80b36c9faa2fa90fe7f33b36d30748c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Jul 2023 08:31:37 +0200 Subject: [PATCH 225/718] remove unused status variable --- common_thread.h | 1 - 1 file changed, 1 deletion(-) diff --git a/common_thread.h b/common_thread.h index 0a4c703b7..06a7a1a38 100644 --- a/common_thread.h +++ b/common_thread.h @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); /* Global Parameter */ extern int blas_cpu_number; extern int blas_num_threads; -extern int blas_num_threads_set; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U From 9ff84dc3f2536ce94e0a300a098485c7ff3d771d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Jul 2023 10:02:44 +0200 Subject: [PATCH 226/718] remove unused status variable --- driver/others/blas_server_omp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 43764df00..fe6b4a7c0 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -101,7 +101,6 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { - if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; From 76aa6bac4df3014acaad26390e6c7e3085d25806 Mon Sep 17 00:00:00 2001 From: steppi Date: Wed, 26 Jul 2023 12:01:12 -0400 Subject: [PATCH 227/718] Fix cirun url [skip actions] --- .cirun.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirun.yml b/.cirun.yml index f0e0149d3..bfc6494d0 100644 --- a/.cirun.yml +++ b/.cirun.yml @@ -1,5 +1,5 @@ # Self-Hosted Github Action Runners on AWS via Cirun.io -# Reference: https://docs.cirun.io/Reference/yml.html +# Reference: https://docs.cirun.io/reference/yaml runners: - name: "aws-runner-graviton" # Cloud Provider: AWS From 2a62d2df96beb60e2048543aca130eb267f94741 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Jul 2023 19:39:11 +0200 Subject: [PATCH 228/718] Enable use of AVX512 microkernels with NVIDIA HPC from version 22.3 --- kernel/x86_64/casum_microk_skylakex-2.c | 5 ++++- kernel/x86_64/cscal_microk_skylakex-2.c | 5 ++++- kernel/x86_64/dasum_microk_haswell-2.c | 5 ++++- kernel/x86_64/dasum_microk_skylakex-2.c | 5 ++++- kernel/x86_64/daxpy_microk_skylakex-2.c | 5 ++++- kernel/x86_64/ddot_microk_skylakex-2.c | 5 ++++- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 5 ++++- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 5 ++++- kernel/x86_64/dgemv_n_microk_skylakex-4.c | 5 ++++- kernel/x86_64/drot_microk_skylakex-2.c | 5 ++++- kernel/x86_64/dscal_microk_skylakex-2.c | 5 ++++- kernel/x86_64/dsymv_L_microk_skylakex-2.c | 5 ++++- kernel/x86_64/dtobf16_microk_cooperlake.c | 5 ++++- kernel/x86_64/sasum_microk_haswell-2.c | 5 ++++- kernel/x86_64/sasum_microk_skylakex-2.c | 5 ++++- kernel/x86_64/saxpy_microk_skylakex-2.c | 5 ++++- kernel/x86_64/sbdot_microk_cooperlake.c | 5 ++++- kernel/x86_64/sbgemv_n_microk_cooperlake.c | 5 ++++- kernel/x86_64/sbgemv_t_microk_cooperlake.c | 5 ++++- kernel/x86_64/sdot_microk_skylakex-2.c | 5 ++++- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 6 +++++- kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 6 +++++- kernel/x86_64/sgemv_n_microk_skylakex-8.c | 8 ++++++-- kernel/x86_64/sgemv_t_microk_skylakex.c | 5 ++++- kernel/x86_64/srot_microk_skylakex-2.c | 5 ++++- kernel/x86_64/sscal_microk_skylakex-2.c | 6 +++++- kernel/x86_64/stobf16_microk_cooperlake.c | 6 +++++- kernel/x86_64/zasum_microk_skylakex-2.c | 6 +++++- kernel/x86_64/zscal_microk_skylakex-2.c | 6 +++++- 29 files changed, 124 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index b398aa6e1..ab0eea2ac 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_CASUM_KERNEL 1 diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c index 8a622427b..a6c012a4c 100644 --- a/kernel/x86_64/cscal_microk_skylakex-2.c +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index fd9da7ebe..bc27c7647 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index 83bc078b3..76b9fbef0 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL 1 diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c index e785a39f1..5b9147d10 100644 --- a/kernel/x86_64/daxpy_microk_skylakex-2.c +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c index 8eabf225a..f076862f7 100644 --- a/kernel/x86_64/ddot_microk_skylakex-2.c +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_8 1 diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index a98772b94..da57a18a7 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 37d1ca497..69ad6d94e 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c index 4030399ab..4e8739864 100644 --- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_4x4 1 diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c index 4e862e663..bf9c044d4 100644 --- a/kernel/x86_64/drot_microk_skylakex-2.c +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DROT_KERNEL 1 diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c index e0598272e..381136414 100644 --- a/kernel/x86_64/dscal_microk_skylakex-2.c +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index f0df5aaa8..ca4773a4b 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c index 9b8ac4714..b713b39be 100644 --- a/kernel/x86_64/dtobf16_microk_cooperlake.c +++ b/kernel/x86_64/dtobf16_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 2eb5b9538..3b4d65cfc 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index fbc91b558..f193053ee 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c index 950f10ba2..bbe4d2bc5 100644 --- a/kernel/x86_64/saxpy_microk_skylakex-2.c +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 2aefe46ff..ccec98e34 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBDOT_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c index d875e0d96..c87f9fa5b 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_N_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c index 23da2e809..5b7a2e147 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_T_ACCL_KERNEL 1 diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c index 1fcb7f27c..f14632f94 100644 --- a/kernel/x86_64/sdot_microk_skylakex-2.c +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 2366fe3aa..6f4309c30 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 308f5e35e..987b090ba 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c index fba9cedcd..199621712 100644 --- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c +++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && __NVCOMPVERS >= 2203 ) + #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 #include "common.h" @@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS } -#endif \ No newline at end of file +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c index dca12acfc..d4f675a1e 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c index a21d1cf64..aec25ac56 100644 --- a/kernel/x86_64/srot_microk_skylakex-2.c +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SROT_KERNEL 1 diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c index c4fa160f0..5c13cba55 100644 --- a/kernel/x86_64/sscal_microk_skylakex-2.c +++ b/kernel/x86_64/sscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c index 2756a6934..e7d20ddfa 100644 --- a/kernel/x86_64/stobf16_microk_cooperlake.c +++ b/kernel/x86_64/stobf16_microk_cooperlake.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index e257a5456..e60abc28b 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -1,5 +1,9 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #define HAVE_ZASUM_KERNEL 1 diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c index f9e05e333..29dc4f6df 100644 --- a/kernel/x86_64/zscal_microk_skylakex-2.c +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include From ee72575475739dac3fd64caaa31cd6f6fd006fe0 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 26 Jul 2023 13:00:07 +0200 Subject: [PATCH 229/718] Add documentation on redistributing OpenBLAS This touches on the following: - build configurations - naming of symbols, shared/static libraries and other build outputs like pkg-config and CMake files - (in more detail) guidance on ILP64 builds It tries to explain that, while this is only guidance and there may be reasons to deviate from that, for some build options there are best practices, and for some others there are choices to make. It also links to a number of well-maintained build recipes in order to help packagers of other distros make choices. Closes gh-3798 [skip ci] --- docs/distributing.md | 270 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 docs/distributing.md diff --git a/docs/distributing.md b/docs/distributing.md new file mode 100644 index 000000000..1e6372a28 --- /dev/null +++ b/docs/distributing.md @@ -0,0 +1,270 @@ +# Guidance for redistributing OpenBLAS + +*We note that this document contains recommendations only - packagers and other +redistributors are in charge of how OpenBLAS is built and distributed in their +systems, and may have good reasons to deviate from the guidance given on this +page. These recommendations are aimed at general packaging systems, with a user +base that typically is large, open source (or freely available at least), and +doesn't behave uniformly or that the packager is directly connected with.* + +OpenBLAS has a large number of build-time options which can be used to change +how it behaves at runtime, how artifacts or symbols are named, etc. Variation +in build configuration can be necessary to acheive a given end goal within a +distribution or as an end user. However, such variation can also make it more +difficult to build on top of OpenBLAS and ship code or other packages in a way +that works across many different distros. Here we provide guidance about the +most important build options, what effects they may have when changed, and +which ones to default to. + +The Make and CMake build systems provide equivalent options and yield more or +less the same artifacts, but not exactly (the CMake builds are still +experimental). You can choose either one and the options will function in the +same way, however the CMake outputs may require some renaming. To review +available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of +the repository. + +Build options typically fall into two categories: (a) options that affect the +user interface, such as library and symbol names or APIs that are made +available, and (b) options that affect performance and runtime behavior, such +as threading behavior or CPU architecture-specific code paths. The user +interface options are more important to keep aligned between distributions, +while for the performance-related options there are typically more reasons to +make choices that deviate from the defaults. + +Here are recommendations for user interface related packaging choices where it +is not likely to be a good idea to deviate (typically these are the default +settings): + +1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect + binary size much, so don't turn it off. +2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and + while it does make up a significant part of the binary size of the installed + library, that does not outweigh the regression in usability when deviating + from the default here.[^1] +3. Always distribute the pkg-config (`.pc`) and CMake `.cmake`) dependency + detection files. These files are used by build systems when users want to + link against OpenBLAS, and there is no benefit of leaving them out. +4. Provide the LP64 interface by default, and if in addition to that you choose + to provide an ILP64 interface build as well, use a symbol suffix to avoid + symbol name clashes (see the next section). + +[^1] All major distributions do include LAPACK as of mid 2023 as far as we +know. Older versions of Arch Linux did not, and that was known to cause +problems. + + +## ILP64 interface builds + +The LP64 (32-bit integer) interface is the default build, and has +well-established C and Fortran APIs as determined by the reference (Netlib) +BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does +not have a standard API: symbol names and shared/static library names can be +produced in multiple ways, and this tends to make it difficult to use. +As of today there is an agreed-upon way of choosing names for OpenBLAS between +a number of key users/redistributors, which is the closest thing to a standard +that there is now. However, there is an ongoing standardization effort in the +reference BLAS and LAPACK libraries, which differs from the current OpenBLAS +agreed-upon convention. In this section we'll aim to explain both. + +Those two methods are fairly similar, and have a key thing in common: *using a +symbol suffix*. This is good practice; it is recommended that if you distribute +an ILP64 build, to have it use a symbol suffix containing `64` in the name. +This avoids potential symbol clashes when different packages which depend on +OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. + +### The current OpenBLAS agreed-upon ILP64 convention + +This convention comprises the shared library name and the symbol suffix in the +shared library. The symbol suffix to use is `64_`, implying that the library +name will be `libopenblas64_.so` and the symbols in that library end in `64_`. +The central issue where this was discussed is +[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters +include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. + +To build shared and static libraries with the currently recommended ILP64 +conventions with Make: +```bash +$ make INTERFACE64=1 SYMBOLSUFFIX=64_ +``` + +This will produce libraries named `libopenblas64_.so|a`, a pkg-config file +named `openblas64.pc`, and CMake and header files. + +Installing locally and inspecting the output will show a few more details: +```bash +$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ +$ tree . # output slightly edited down +. +├── include +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── openblas + │   ├── OpenBLASConfig.cmake + │   └── OpenBLASConfigVersion.cmake + ├── libopenblas64_.a + ├── libopenblas64_.so + └── pkgconfig + └── openblas64.pc +``` + +A key point are the symbol names. These will equal the LP64 symbol names, then +(for Fortran only) the compiler mangling, and then the `64_` symbol suffix. +Hence to obtain the final symbol names, we need to take into account which +Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel +Fortran, or Flang), that means appending a single underscore. In that case, the +result is: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | + +It is quite useful to have these symbol names be as uniform as possible across +different packaging systems. + +The equivalent build options with CMake are: +```bash +$ mkdir build && cd build +$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON +$ cmake --build . -j +``` + +Note that the result is not 100% identical to the Make result. For example, the +library name ends in `_64` rather than `64_` - it is recommended to rename them +to match the Make library names (also update the `libsuffix` entry in +`openblas64.pc` to match that rename). +```bash +$ cmake --install . --prefix $PWD/../../openblas/cmake64 +$ tree . +. +├── include +│   └── openblas64 +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke_example_aux.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   ├── openblas64 +│   │   └── lapacke_mangling.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── OpenBLAS64 + │   ├── OpenBLAS64Config.cmake + │   ├── OpenBLAS64ConfigVersion.cmake + │   ├── OpenBLAS64Targets.cmake + │   └── OpenBLAS64Targets-noconfig.cmake + ├── libopenblas_64.a + ├── libopenblas_64.so -> libopenblas_64.so.0 + └── pkgconfig + └── openblas64.pc +``` + + +### The upcoming standardized ILP64 convention + +While the `64_` convention above got some adoption, it's slightly hacky and is +implemented through the use of `objcopy`. An effort is ongoing for a more +broadly adopted convention in the reference BLAS and LAPACK libraries, using +(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after +Fortran compiler mangling. The central issue for this is +[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). + +For the most common cases of compiler mangling (a single `_` appended), the end +result will be: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | + +For other compiler mangling schemes, replace the trailing `_` by the scheme in use. + +The shared library name for this `_64` convention should be `libopenblas_64.so`. + +Note: it is not yet possible to produce an OpenBLAS build which employs this +convention! Once reference BLAS and LAPACK with support for `_64` have been +released, a future OpenBLAS release will support it. For now, please use the +older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be +considered reserved for future use of the `_64` standard as prescribed by +reference BLAS/LAPACK. + + +## Performance and runtime behavior related build options + +For these options there are multiple reasonable or common choices. + +### Threading related options + +OpenBLAS can be built as a multi-threaded or single-threaded library, with the +default being multi-threaded. It's expected that the default `libopenblas` +library is multi-threaded; if you'd like to also distribute single-threaded +builds, consider naming them `libopenblas_sequential`. + +OpenBLAS can be built with pthreads or OpenMP as the threading model, with the +default being pthreads. Both options are commonly used, and the choice here +should not influence the shared library name. The choice will be captured by +the `.pc` file. E.g.,: +```bash +$ pkg-config --libs openblas +-fopenmp -lopenblas + +$ cat openblas.pc +... +openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 +``` + +The maximum number of threads users will be able to use is determined at build +time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide +range of values that are reasonable to use (up to 256). 64 is a typical choice +here; there is a memory footprint penalty that is linear in `NUM_THREADS`. +Please see `Makefile.rule` for more details. + +### CPU architecture related options + +OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when +distributing to a user base with a variety of hardware, it is recommended to +enable CPU architecture runtime detection. This will dynamically select +optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` +build option. This is usually done on all common CPU families, except when +there are known issues. + +In case the CPU architecture is known (e.g. you're building binaries for macOS +M1 users), it is possible to specify the target architecture directly with the +`TARGET=` build option. + +`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` +in this repository. + + +## Real-world examples + +OpenBLAS is likely to be distributed in one of these distribution models: + +1. As a standalone package, or multiple packages, in a packaging ecosystem like + a Linux distro, Homebrew, conda-forge or MSYS2. +2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. +3. Locally, e.g. making available as a build on a single HPC cluster. + +The guidance on this page is most important for models (1) and (2). These links +to build recipes for a representative selection of packaging systems may be +helpful as a reference: + +- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) +- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) +- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) +- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) +- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) +- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) +- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) From 730ca04b48129a3810a5881b31d5fce4cdeb37ef Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 27 Jul 2023 13:27:28 +0100 Subject: [PATCH 230/718] Fix ZHEMM copy for SVE Whilst disambiguating whilelt, I inadvertantly used the wrong datatype for offsets, which can be negative. This rectifies that. --- kernel/arm64/zhemm_ltcopy_sve.c | 4 ++-- kernel/arm64/zhemm_utcopy_sve.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 615667264..fcf7e7073 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index f763d5607..056c9824e 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } From 84a268b6ca208891177f7fbb82b87c0d561de3f3 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 27 Jul 2023 10:55:34 +0100 Subject: [PATCH 231/718] Use SVE zgemm/cgemm on Arm(R) Neoverse(TM) V1 core This patch removes the prefetches from cgemm/zgemm which improves the performance similar to sgemm/dgemm did in #3868, this means I'm happy to enable this on any applicable cores. I also replicated the unrolling the copies from sgemm and dgemm. --- kernel/arm64/KERNEL.ARMV8SVE | 8 +- kernel/arm64/KERNEL.NEOVERSEV1 | 65 ----------- kernel/arm64/cgemm_kernel_sve_v1x4.S | 39 ------- kernel/arm64/gemm_ncopy_complex_sve_v1x4.c | 121 +++++++++++++++++++++ kernel/arm64/gemm_tcopy_complex_sve_v1x4.c | 115 ++++++++++++++++++++ kernel/arm64/zgemm_kernel_sve_v1x4.S | 41 ------- param.h | 6 +- 7 files changed, 244 insertions(+), 151 deletions(-) create mode 100644 kernel/arm64/gemm_ncopy_complex_sve_v1x4.c create mode 100644 kernel/arm64/gemm_tcopy_complex_sve_v1x4.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 38d7ff8f1..ccbce27e1 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -160,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c @@ -184,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index a8dafe862..bc5999097 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,66 +1 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRMMUNCOPY_M = -CTRMMLNCOPY_M = -CTRMMUTCOPY_M = -CTRMMLTCOPY_M = -CHEMMLTCOPY_M = -CHEMMUTCOPY_M = -CSYMMUCOPY_M = -CSYMMLCOPY_M = - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -else -CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMCOPYLN_M = -ZTRSMCOPYLT_M = -ZTRSMCOPYUN_M = -ZTRSMCOPYUT_M = - -ZTRMMUNCOPY_M = -ZTRMMLNCOPY_M = -ZTRMMUTCOPY_M = -ZTRMMLTCOPY_M = -ZHEMMLTCOPY_M = -ZHEMMUTCOPY_M = -ZSYMMUCOPY_M = -ZSYMMLCOPY_M = - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -else -ZGEMMINCOPYOBJ = -ZGEMMITCOPYOBJ = -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S index 38770f66b..2136ebbee 100644 --- a/kernel/arm64/cgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.s, p1/m, z2.s, z15.s ld1rw z15.s, p0/z, [pB, 28] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 32 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z3.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z1.s, z15.s OP_ri z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2w {z28.s, z29.s}, p1/z, [pCRow2] fmla z28.s, p1/m, z20.s, alphaz_R fmls z28.s, p1/m, z21.s, alphaz_I @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.s, p1/m, z23.s, alphaz_R st2w {z30.s, z31.s}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, s0 dup alphaz_R, alphaR fmov alphaI, s1 @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lcgemm_kernel_L4_Mv1_46 .Lcgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lcgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c new file mode 100644 index 000000000..90f867b44 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \ + svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \ + a_offset_inner += 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); + SV_TYPE a_vec_real; + SV_TYPE a_vec_imag; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c new file mode 100644 index 000000000..975166a2e --- /dev/null +++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64x2_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32x2_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld2(pg, a_offset_inner); \ + svst2(pg, b_offset, a_vec); \ + a_offset_inner += lda * 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + + diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index d5b35775c..a043948d6 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.d, p1/m, z2.d, z15.d ld1rd z15.d, p0/z, [pB, 56] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 64 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z3.d, z14.d OP_ii z22.d, p1/m, z3.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z1.d, z14.d OP_ii z22.d, p1/m, z1.d, z15.d OP_ri z23.d, p1/m, z0.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.d, p1/m, z23.d, alphaz_R st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, d0 dup alphaz_R, alphaR fmov alphaI, d1 @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lzgemm_kernel_L4_Mv1_46 .Lzgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lzgemm_kernel_L4_Mv1_END: diff --git a/param.h b/param.h index 84e0c2ac7..8fb4bcc48 100644 --- a/param.h +++ b/param.h @@ -3385,11 +3385,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate #define DGEMM_DEFAULT_UNROLL_N 8 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 007cd834c198d0df59f29893e72b26dbb99cd6fa Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Fri, 28 Jul 2023 04:50:16 +0300 Subject: [PATCH 232/718] Use defined variable for riscv64 in arch.cmake It's defined in #4137 --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 07df31b89..ebdc5a833 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -135,7 +135,7 @@ if (ARM64) set(BINARY_DEFINED 1) endif () -if (${ARCH} STREQUAL "riscv64") +if (RISCV64) set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () From 4ed6414c17a9d194e8f93e5a3f1796879c49d9a9 Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Fri, 28 Jul 2023 04:53:27 +0300 Subject: [PATCH 233/718] Fix 64-bit fortran options for riscv64 64-bit builds are currently broken without this flag. Makefiles have done this already: https://github.com/xianyi/OpenBLAS/blob/5720fa02c58562c7d3e6a3e97b053598548e98d9/Makefile.system#L831 --- cmake/fc.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index a67760885..2f7ee3371 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -63,6 +63,11 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") endif () endif () + if (RISCV64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") From f5506b002cac8226be51d9ccdc45cf37f94d1d42 Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Fri, 28 Jul 2023 16:19:14 +0300 Subject: [PATCH 234/718] Add 64-bit flag on INTERFACE64 only --- cmake/fc.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 2f7ee3371..7400cb3e6 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -65,7 +65,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") endif () if (RISCV64) if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () endif () endif () else () From b209915121d7d5f38fdcfcd6029c2d14d2182ad0 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 25 Jul 2023 15:10:50 -0700 Subject: [PATCH 235/718] Fix build with clang There are two instances when building the tests where OpenBLAS fails to build with OpenMP and clang due to library paths getting reset as flags are set rather than appended. This seems to only affect certain clang/libomp installations, but if it's already grabbing the correct library paths we might as well use them. --- ctest/Makefile | 2 +- test/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 0fb2450d2..9e85d23b9 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) diff --git a/test/Makefile b/test/Makefile index 923f1537c..46a7b1158 100644 --- a/test/Makefile +++ b/test/Makefile @@ -265,7 +265,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) From fe54ee3d15a1dd48cfebd227749e7e747e54c06e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Jul 2023 11:48:38 +0200 Subject: [PATCH 236/718] nvc currently miscompiles this, hopefully fixed in release 23.09 --- kernel/x86_64/casum_microk_skylakex-2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index ab0eea2ac..7161a0a62 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -2,7 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309 ) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) #define HAVE_CASUM_KERNEL 1 @@ -350,3 +352,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif From 9f6847583a5e0c5360760bd247c34bf7ce8db022 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Jul 2023 11:50:16 +0200 Subject: [PATCH 237/718] nvc currently miscompiles this, hopefully fixed in release 23.09 --- kernel/x86_64/zasum_microk_skylakex-2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index e60abc28b..94802e951 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -2,8 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309 ) +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) #define HAVE_ZASUM_KERNEL 1 @@ -342,3 +343,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif From ebb447e32ef95348bcea447b7b8db4cca37a2615 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Jul 2023 12:23:57 +0200 Subject: [PATCH 238/718] Update zasum_microk_skylakex-2.c --- kernel/x86_64/zasum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index 94802e951..d4f35db63 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -2,7 +2,7 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309 ) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)) #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) From 6a428b5629fdf09c896d7659e49796e844ba9848 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 Jul 2023 12:24:30 +0200 Subject: [PATCH 239/718] Update casum_microk_skylakex-2.c --- kernel/x86_64/casum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index 7161a0a62..ac1dc6fa1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -2,7 +2,7 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309 ) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)) #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) From 4664b57e6ede3c55efea2c6aba01dad59e84e67a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Aug 2023 12:25:34 +0200 Subject: [PATCH 240/718] use shortcut only when both incx and incy are zero --- kernel/x86_64/zaxpy_sse2.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index a7dd054fb..3776c8910 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1418,10 +1418,10 @@ movq M, %rax //If incx==0 || incy==0, avoid unloop and jump to end. cmpq $0, INCX - je .L58 + jne .L59 cmpq $0, INCY je .L58 - +.L59: sarq $3, %rax jle .L55 From d64fa286f7c5c911c7a07a5638bc9090c54bafbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Aug 2023 12:26:36 +0200 Subject: [PATCH 241/718] add test case for zaxpy with incx=0 incy=1 --- utest/test_axpy.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 5fd7c1b04..33b6cf6ed 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -74,6 +74,28 @@ CTEST(axpy,zaxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,zaxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + double a[2]={0.25,0.5}; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ +//fprintf(stderr,"output X %lf\n",x1[i]); +//fprintf(stderr,"output Y %lf\n",y1[i]); + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + #endif #ifdef BUILD_SINGLE From 862d06ab8ae0486a41c138ed51c2cbc75722a906 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Aug 2023 15:28:02 +0200 Subject: [PATCH 242/718] Add INCX=0,INCY=1 test case for CAXPY --- utest/test_axpy.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 33b6cf6ed..26005e70f 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -89,8 +89,6 @@ CTEST(axpy,zaxpy_incx_0) BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY); for(i=0; i<2*N; i++){ -//fprintf(stderr,"output X %lf\n",x1[i]); -//fprintf(stderr,"output Y %lf\n",y1[i]); ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } @@ -138,5 +136,24 @@ CTEST(axpy,caxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,caxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + float a[2]={0.25,0.5}; + float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} #endif From f2c9ae9c332a5247efc79ede4df5656331a11f65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Aug 2023 15:31:03 +0200 Subject: [PATCH 243/718] Identify the new generation of flang that comes with LLVM17 --- f_check | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/f_check b/f_check index d071e016e..526c41dc6 100755 --- a/f_check +++ b/f_check @@ -101,7 +101,14 @@ else *flang*) vendor=FLANG openmp='-fopenmp' - ;; + data=`$compiler -v 2>&1 > /dev/null ` + v="${data#*version *}" + v="${v%%*.}" + major="${v%%.*}" + if [ "$major" -ge 17 ]; then + vendor=FLANGNEW + fi + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' From e8bc8a0ee7af5cf42c736253c300b0248bf9782c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 4 Aug 2023 15:32:19 +0200 Subject: [PATCH 244/718] Add support for the new generation flang that comes with LLVM17 --- Makefile.system | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 62926b380..71535b0cb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1087,8 +1087,9 @@ endif endif endif -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) CCOMMON_OPT += -DF_INTERFACE_GFORT +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive @@ -1102,6 +1103,7 @@ EXTRALIB += -lgfortran endif endif endif +endif ifdef NO_BINARY_MODE ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 From d46772e037e0eb8b5abe6e39b434105921306eed Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jun 2023 16:19:38 +0800 Subject: [PATCH 245/718] LoongArch64: Add compiler feature checks --- Makefile.system | 2 ++ c_check | 35 +++++++++++++++++++++ c_check.pl | 45 +++++++++++++++++++++++++++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 ++ param.h | 10 ++++-- 5 files changed, 92 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 71535b0cb..3c1648dc7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1770,6 +1770,8 @@ export TARGET_CORE export NO_AVX512 export NO_AVX2 export BUILD_BFLOAT16 +export NO_LSX +export NO_LASX export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N diff --git a/c_check b/c_check index 7c8494e4a..7ee183163 100755 --- a/c_check +++ b/c_check @@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then rm -rf "$tmpd" fi +no_lsx=0 +no_lasx=0 +if [ "$architecture" = "loongarch64" ]; then + tmpd="$(mktemp -d)" + tmplsx="$tmpd/lsx.c" + codelsx='"vadd.b $vr0, $vr0, $vr0"' + lsx_flags='-march=loongarch64 -mlsx' + printf "#include \n\n" >> "$tmplsx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" + args="$lsx_flags -o $tmplsx.o $tmplsx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lsx=1 + } + + tmplasx="$tmpd/lasx.c" + codelasx='"xvadd.b $xr0, $xr0, $xr0"' + lasx_flags='-march=loongarch64 -mlasx' + printf "#include \n\n" >> "$tmplasx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" + args="$lasx_flags -o $tmplasx.o $tmplasx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lasx=1 + } + + rm -rf "$tmpd" +fi + case "$data" in *ARCH_X86_64*) architecture=x86_64 ;; *ARCH_X86*) architecture=x86 ;; @@ -399,6 +430,8 @@ done [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" } >> "$makefile" os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` @@ -414,6 +447,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" + [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" + [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" } >> "$config" diff --git a/c_check.pl b/c_check.pl index 6ce28e11b..7a860a211 100644 --- a/c_check.pl +++ b/c_check.pl @@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } +$no_lsx = 0; +$no_lasx = 0; +if (($architecture eq "loongarch64")) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; + } else { + $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; + $lsx_flags = "-march=loongarch64 -mlsx"; + print $tmplsx "#include \n\n"; + print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; + + $args = "$lsx_flags -o $tmplsx.o $tmplsx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lsx = 1; + } else { + $no_lsx = 0; + } + unlink("$tmplsx.o"); + + $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; + $lasx_flags = "-march=loongarch64 -mlasx"; + print $tmplasx "#include \n\n"; + print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; + + $args = "$lasx_flags -o $tmplasx.o $tmplasx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lasx = 1; + } else { + $no_lasx = 0; + } + unlink("$tmplasx.o"); + } +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = e2k if ($data =~ /ARCH_E2K/); @@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; +print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; +print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; +print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; +print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; if ($os eq "LINUX") { diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..253aa2464 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,3 +1,4 @@ +ifndef NO_LASX DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -7,6 +8,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/param.h b/param.h index 8fb4bcc48..547463b2f 100644 --- a/param.h +++ b/param.h @@ -2845,15 +2845,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL -#define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(NO_LASX) +#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#else #define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 From db9a42f8c366c46dd914a1c6057afb87b2ca4e00 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jun 2023 16:31:47 +0800 Subject: [PATCH 246/718] LoongArch64: using getauxval to do runtime check Using the getauxval instruction can prevent errors caused by hardware supporting vector instructions while the kernel does not support them --- cpuid_loongarch64.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index ca07c7ffb..7c389db27 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LOONGARCH_CFG2 0x02 -#define LOONGARCH_LASX 1<<7 -#define LOONGARCH_LSX 1<<6 +#define LA_HWCAP_LSX (1<<4) +#define LA_HWCAP_LASX (1<<5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - uint32_t reg = 0; + int flag = (int)getauxval(AT_HWCAP); - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(reg) - : "r"(LOONGARCH_CFG2) - ); - - if (reg & LOONGARCH_LASX) + if (flag & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (reg & LOONGARCH_LSX) + else if (flag & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; From 96bf226bca7e2f0a6359f524280222e45df6bff1 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jun 2023 17:34:08 +0800 Subject: [PATCH 247/718] gh-actions: Add loongarch64 CI --- .github/workflows/loongarch64.yml | 110 ++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 .github/workflows/loongarch64.yml diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml new file mode 100644 index 000000000..5501e98e0 --- /dev/null +++ b/.github/workflows/loongarch64.yml @@ -0,0 +1,110 @@ +name: loongarch64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: LOONGSONGENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + - target: LOONGSON3R5 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON3R5 + - target: LOONGSON2K1000 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install APT deps + run: | + sudo add-apt-repository ppa:savoury1/virtualisation + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + qemu-user-static + + - name: Download and install loongarch64-toolchain + run: | + wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz + tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + + - name: Set env + run: | + echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Disable utest dsdot:dsdot_n_1 + run: | + echo -n > utest/test_dsdot.c + echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." + + - name: Build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: Test + run: | + qemu-loongarch64-static ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat From ec1e96aac851fd620a6a981b20e9dfdba6dd9493 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 11 Jul 2023 10:01:12 +0800 Subject: [PATCH 248/718] LoongArch64: Add dgemv_t_8_lasx.S and dgemv_n_8_lasx.S --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/KERNEL.generic | 4 + kernel/loongarch64/dgemv_n_8_lasx.S | 546 ++++++++++++++++++++++++++ kernel/loongarch64/dgemv_t_8_lasx.S | 468 ++++++++++++++++++++++ kernel/loongarch64/loongarch64_asm.S | 313 +++++++++++++++ 5 files changed, 1334 insertions(+) create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S create mode 100644 kernel/loongarch64/loongarch64_asm.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..19977c815 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -8,6 +8,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMVNKERNEL = dgemv_n_8_lasx.S +DGEMVTKERNEL = dgemv_t_8_lasx.S + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index b772a6f82..213add9ee 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c +ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c +endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c +endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S new file mode 100644 index 000000000..940d27569 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -0,0 +1,546 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/14 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 + +.macro DLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GMUL xvf, d, X0, X0, VALPHA +.endm + +.macro DLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DLOAD_Y_4 + GLD xv, , Y0, Y, 0 +.endm + +.macro DLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro DSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DSTORE_Y_4 + GST xv, , Y0, Y, 0 +.endm + +.macro DSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +// Unable to use vector load/store ins +.macro DLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro DLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 +.endm + +.macro DSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro DSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 +.endm + +.macro DLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ + Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ + Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ + Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ + Y0, A14, X7, Y0, Y1, A15, X7, Y1 +.endm + +.macro DGEMV_N_4x8 + GLD_INC xv, , 0x20, A0, PA0, 0, \ + A2, PA1, 0, \ + A4, PA2, 0, \ + A6, PA3, 0, \ + A8, PA4, 0, \ + A10, PA5, 0, \ + A12, PA6, 0, \ + A14, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, \ + Y0, A6, X3, Y0, \ + Y0, A8, X4, Y0, \ + Y0, A10, X5, Y0, \ + Y0, A12, X6, Y0, \ + Y0, A14, X7, Y0 +.endm + +.macro DGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, \ + $f10, $f18, $f5, $f10, \ + $f10, $f20, $f6, $f10, \ + $f10, $f22, $f7, $f10, \ + $f10, $f24, $f8, $f10, \ + $f10, $f26, $f9, $f10, +.endm + +.macro DGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1 +.endm + +.macro DGEMV_N_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, Y0, A6, X3, Y0 +.endm + +.macro DGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 +.endm + +.macro DGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1 +.endm + +.macro DGEMV_N_4x2 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0 +.endm + +.macro DGEMV_N_1x2 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10 +.endm + +.macro DGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + fmadd.d $f10, $f12, $f2, $f10 +.endm + +.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + DLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x8 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\Y_4 + DGEMV_N_4x8 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x8 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + DLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x4 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\Y_4 + DGEMV_N_4x4 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_4_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x4 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + DLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x2 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\Y_4 + DGEMV_N_4x2 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_2_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x2 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + DLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x1 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 24 + 4 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 24 + 4 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S new file mode 100644 index 000000000..be90cb1af --- /dev/null +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -0,0 +1,468 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/17 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro DLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro DLOAD_X4 + GLD xv, , X0, X, 0x00 +.endm + +.macro DLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro DLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 +.endm + +.macro DGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ + TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ + TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ + TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ + TP7, A14, X0, TP7, TP7, A15, X1, TP7 +.endm + +.macro DGEMV_T_8x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ + A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ + TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ + TP6, A12, X0, TP6, TP7, A14, X0, TP7, +.endm + +.macro DGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3 +.endm + +.macro DGEMV_T_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3 +.endm + +.macro DGEMV_T_2x8 + GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1 +.endm + +.macro DGEMV_T_2x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 +.endm + +.macro DGEMV_T XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\X8 + DGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\X4 + DGEMV_T_8x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ + $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ + $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\X8 + DGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\X4 + DGEMV_T_4x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_4_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\X8 + DGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\X4 + DGEMV_T_2x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_2_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1 + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + fmadd.d $f19, $f3, $f1, $f19 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 + fld.d $f3, Y, 0x00 + fmadd.d $f3, ALPHA, $f19, $f3 + fst.d $f3, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 24 + 3 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + DGEMV_T GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + DGEMV_T GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 24 + 3 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S new file mode 100644 index 000000000..8876cbed9 --- /dev/null +++ b/kernel/loongarch64/loongarch64_asm.S @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#if __loongarch_grlen == 64 +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_ALSL alsl.d +#else +#define LA_REG int32_t +#define REG_SIZE 4 +#define REG_LOG 2 +#define PTR_ADDI addi.w +#define PTR_ADD add.w +#define PTR_SUB sub.w +#define PTR_LD ld.w +#define PTR_ST st.w +#define PTR_SLLI slli.w +#define PTR_SRLI srli.w +#define PTR_ALSL alsl.w +#endif + +#if __loongarch_frlen == 64 +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#else +#define FREG_SIZE 4 +#define FREG_LOG 2 +#define PTR_FLD fld.s +#define PTR_FST fst.s +#endif + +// The max registers available to the user which +// do not need to be preserved across calls. +// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html +#define MAX_INT_CALLER_SAVED 17 +#define MAX_FP_CALLER_SAVED 24 + +.altmacro // Enable alternate macro mode + +.macro push_if_used regs, fregs +.if \regs > MAX_INT_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) + push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.endif +.if \fregs > MAX_FP_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) + push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.endif +.endm // End push_if_used +.macro pop_if_used regs, fregs +.if \fregs > MAX_FP_CALLER_SAVED + pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.endif +.if \regs > MAX_INT_CALLER_SAVED + pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.endif +.endm // End pop_if_used +.macro push_regs from, to + PTR_ST $s\()\from, $sp, \from << REG_LOG +.if \to - \from + push_regs %from + 1, \to +.endif +.endm // End push_regs +.macro pop_regs from, to + PTR_LD $s\()\from, $sp, \from << REG_LOG +.if \to - \from + pop_regs %from + 1, \to +.endif +.endm // End pop_regs +.macro push_fregs from, to + PTR_FST $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + push_fregs %from + 1, \to +.endif +.endm // End push_fregs +.macro pop_fregs from, to + PTR_FLD $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + pop_fregs %from + 1, \to +.endif +.endm // End pop_fregs + +// +// Instruction Related Macros +// +// GLD +// +.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLD \pre_op, \suf_op, \more +.endif +.endm + +// +// GLD_INC +// +.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif + PTR_ADDI \src, \src, \inc +.ifnb \more + GLD_INC \pre_op, \suf_op, \inc, \more +.endif +.endm +// +// GLDX is same as GLD except the stride is a register +// +.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ldx \out, \src, \offset +.else + \pre_op\()ldx.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLDX \pre_op, \suf_op, \more +.endif +.endm +// +// GLDREPL +// +.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg + \pre_op\()ldrepl.\suf_op \out, \src, \offset +.ifnb \more + GLDREPL \pre_op, \suf_op, \more +.endif +.endm +// +// GST +// +.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()st \src, \dst, \offset +.else + \pre_op\()st.\suf_op \src, \dst, \offset +.endif +.ifnb \more + GST \pre_op, \suf_op, \more +.endif +.endm +// +// GMUL +// +.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()mul.\suf_op \out, \in0, \in1 +.ifnb \more + GMUL \pre_op, \suf_op, \more +.endif +.endm +// +// GMADD +// +.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GMADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADD +// +.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()add.\suf_op \out, \in0, \in1 +.ifnb \more + GADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADDI +// +.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()addi.\suf_op \out, \in0, \in1 +.ifnb \more + GADDI \pre_op, \suf_op, \more +.endif +.endm +// +// GSLLI +// +.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()slli.\suf_op \out, \in0, \in1 +.ifnb \more + GSLLI \pre_op, \suf_op, \more +.endif +.endm +// +// GINSVE0 +// +.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()insve0.\suf_op \out, \in0, \in1 +.ifnb \more + GINSVE0 \pre_op, \suf_op, \more +.endif +.endm +// +// GXOR +// +.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \more + GXOR \pre_op, \suf_op, \more +.endif +.endm + +// +// Compound instructions +// +// GACC: Accumulate the values of vector registers +// +.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "xv" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + xvpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifeqs "\pre_op", "v" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + vpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifnb \more + GACC \pre_op, \suf_op, \more +.endif +.endm From 57256623f4e2e1ba9303f924d9b2af82bdc45de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Apr 2023 10:20:54 +0200 Subject: [PATCH 249/718] fc.cmake: Add support for LLVM Flang. --- cmake/fc.cmake | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index a67760885..b70c788c1 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,7 +3,8 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (${F_COMPILER} STREQUAL "FLANG") +if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") @@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") endif () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") +if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - # ensure reentrancy of lapack codes - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") - # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") - #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc - if (NOT NO_LAPACK) - set(EXTRALIB "${EXTRALIB} -lgfortran") + if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # ensure reentrancy of lapack codes + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") + if (NOT NO_LAPACK) + # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + set(EXTRALIB "${EXTRALIB} -lgfortran") + endif () endif () if (NO_BINARY_MODE) if (MIPS64) From f334bd9041f52528545dd6db63d9b435f129cbd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Apr 2023 10:36:21 +0200 Subject: [PATCH 250/718] CI (MSYS2): Use LLVM Flang on CLANG64 runners. Add CLANG32 runner. --- .github/workflows/dynamic_arch.yml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c16f87954..1b45cb184 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -151,40 +151,47 @@ jobs: strategy: fail-fast: false matrix: - msystem: [MINGW64, MINGW32, CLANG64] + msystem: [MINGW64, MINGW32, CLANG64, CLANG32] idx: [int32, int64] build-type: [Release] include: - msystem: MINGW64 idx: int32 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc - msystem: MINGW32 idx: int32 target-prefix: mingw-w64-i686 - fc-pkg: mingw-w64-i686-gcc-fortran + fc-pkg: fc - msystem: CLANG64 idx: int32 target-prefix: mingw-w64-clang-x86_64 + fc-pkg: fc + - msystem: CLANG32 + idx: int32 + target-prefix: mingw-w64-clang-i686 + fc-pkg: cc c-lapack-flags: -DC_LAPACK=ON - msystem: MINGW64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc - msystem: CLANG64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 - c-lapack-flags: -DC_LAPACK=ON + fc-pkg: fc - msystem: MINGW64 idx: int32 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc build-type: None exclude: - msystem: MINGW32 idx: int64 + - msystem: CLANG32 + idx: int64 defaults: run: @@ -209,7 +216,7 @@ jobs: install: >- base-devel ${{ matrix.target-prefix }}-cc - ${{ matrix.fc-pkg }} + ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} ${{ matrix.target-prefix }}-cmake ${{ matrix.target-prefix }}-ninja ${{ matrix.target-prefix }}-ccache From 1c3fcaaf42e3b55eef34e53b3d999a981c793e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Mon, 24 Apr 2023 18:32:03 +0200 Subject: [PATCH 251/718] CI (MSYS2): Re-run failed tests verbosely. --- .github/workflows/dynamic_arch.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 1b45cb184..85b20c060 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -287,9 +287,22 @@ jobs: key: ${{ steps.ccache-prepare.outputs.key }} - name: Run tests + id: run-ctest timeout-minutes: 60 run: cd build && ctest + - name: Re-run tests + if: always() && (steps.run-ctest.outcome == 'failure') + timeout-minutes: 60 + run: | + cd build + echo "::group::Re-run ctest" + ctest --rerun-failed --output-on-failure || true + echo "::endgroup::" + echo "::group::Log from these tests" + [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log + echo "::endgroup::" + cross_build: runs-on: ubuntu-22.04 From 53378296c8f27b13c527700199412bc292507a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Sat, 5 Aug 2023 13:47:38 +0200 Subject: [PATCH 252/718] CI: Build with NO_AVX512 for the runners that use Flang 16. --- .github/workflows/dynamic_arch.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 85b20c060..4fe6e63fc 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -167,6 +167,9 @@ jobs: idx: int32 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 - msystem: CLANG32 idx: int32 target-prefix: mingw-w64-clang-i686 @@ -182,6 +185,9 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 - msystem: MINGW64 idx: int32 target-prefix: mingw-w64-x86_64 @@ -268,6 +274,7 @@ jobs: -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ ${{ matrix.c-lapack-flags }} \ + ${{ matrix.no-avx512-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. From df978c90cdbc51e24113ff4bffd6c438fa3db3b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Aug 2023 18:32:41 +0200 Subject: [PATCH 253/718] Update Jenkinsfile.pwr --- Jenkinsfile.pwr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr index d141ed8a5..96e18b8ad 100644 --- a/Jenkinsfile.pwr +++ b/Jenkinsfile.pwr @@ -9,7 +9,7 @@ pipeline { steps { sh 'sudo apt update' sh 'sudo apt install gfortran -y' - sh 'make' + sh 'make clean && make' } } } From 51c218d17a74c5722f76590022130b2faa3616ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Aug 2023 18:33:15 +0200 Subject: [PATCH 254/718] Update Jenkinsfile --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5fad6a95b..baeeee59f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -7,7 +7,7 @@ pipeline { stages { stage('Build') { steps { - sh 'make' + sh 'make clean && make' } } } From 61d803547afb6bb5889a2f00ac169e0e44d1febe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Aug 2023 15:17:38 +0200 Subject: [PATCH 255/718] Apply USE_TRMM to MIPS64_GENERIC as to GENERIC --- kernel/Makefile.L3 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index bea6cb048..174a1d41b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -35,6 +35,12 @@ USE_TRMM = 1 endif endif +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), MIPS64_GENERIC) +USE_TRMM = 1 +endif +endif + ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif From 41c31bc1d4dbdaea326f41cbdf7f254826f8a25a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Aug 2023 16:00:03 +0200 Subject: [PATCH 256/718] Revert "LoongArch64: Add dgemv_t_8_lasx.S and dgemv_n_8_lasx.S" --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 - kernel/loongarch64/KERNEL.generic | 4 - kernel/loongarch64/dgemv_n_8_lasx.S | 546 -------------------------- kernel/loongarch64/dgemv_t_8_lasx.S | 468 ---------------------- kernel/loongarch64/loongarch64_asm.S | 313 --------------- 5 files changed, 1334 deletions(-) delete mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S delete mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S delete mode 100644 kernel/loongarch64/loongarch64_asm.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 79671c454..253aa2464 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -10,9 +10,6 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -DGEMVNKERNEL = dgemv_n_8_lasx.S -DGEMVTKERNEL = dgemv_t_8_lasx.S - DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index 213add9ee..b772a6f82 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,16 +132,12 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c -ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c -endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c -ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c -endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S deleted file mode 100644 index 940d27569..000000000 --- a/kernel/loongarch64/dgemv_n_8_lasx.S +++ /dev/null @@ -1,546 +0,0 @@ -/******************************************************************************* -Copyright (c) 2023, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#define ASSEMBLER - -#include "common.h" -#include "loongarch64_asm.S" - -/********************************************************************* -* 2023/07/14 guxiwei -* UTEST : OK -* CTEST : OK -* TEST : OK -* -* -*********************************************************************/ - -/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, - * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) - */ -#define M $r4 -#define N $r5 -#define ALPHA $f0 -#define A $r7 -#define LDA $r8 -#define X $r9 -#define INC_X $r10 -#define Y $r11 -#define INC_Y $r6 - -#define J $r12 -#define I $r13 -#define K $r14 -#define Y_ORG $r15 -#define OFFSET $r16 -#define K_LDA $r17 -#define M8 $r18 -#define T0 $r19 -#define PA0 $r20 -#define PA1 $r23 -#define PA2 $r24 -#define PA3 $r25 -#define PA4 $r26 -#define PA5 $r27 -#define PA6 $r28 -#define PA7 $r29 - -#define VALPHA $xr1 -#define X0 $xr2 -#define X1 $xr3 -#define X2 $xr4 -#define X3 $xr5 -#define X4 $xr6 -#define X5 $xr7 -#define X6 $xr8 -#define X7 $xr9 -#define Y0 $xr10 -#define Y1 $xr11 -#define A0 $xr12 -#define A1 $xr13 -#define A2 $xr14 -#define A3 $xr15 -#define A4 $xr16 -#define A5 $xr17 -#define A6 $xr18 -#define A7 $xr19 -#define A8 $xr20 -#define A9 $xr21 -#define A10 $xr22 -#define A11 $xr23 -#define A12 $xr24 -#define A13 $xr25 -#define A14 $xr26 -#define A15 $xr27 - -.macro DLOAD_X_8 - GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ - X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ - X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -.endm - -.macro DLOAD_X_4 - GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -.endm - -.macro DLOAD_X_2 - GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA -.endm - -.macro DLOAD_X_1 - GLDREPL xv, d, X0, X, 0x00 - GMUL xvf, d, X0, X0, VALPHA -.endm - -.macro DLOAD_Y_8 - GLD xv, , Y0, Y, 0, Y1, Y, 0x20 -.endm - -.macro DLOAD_Y_4 - GLD xv, , Y0, Y, 0 -.endm - -.macro DLOAD_Y_1 - fld.d $f10, Y, 0 -.endm - -.macro DSTORE_Y_8 - GST xv, , Y0, Y, 0, Y1, Y, 0x20 -.endm - -.macro DSTORE_Y_4 - GST xv, , Y0, Y, 0 -.endm - -.macro DSTORE_Y_1 - fst.d $f10, Y, 0 -.endm - -// Unable to use vector load/store ins -.macro DLOAD_Y_8_GAP - fld.d $f10, Y, 0 - fldx.d $f13, Y, INC_Y - PTR_ALSL T0, INC_Y, Y, 1 - fld.d $f14, T0, 0 - fldx.d $f15, T0, INC_Y - PTR_ALSL T0, INC_Y, Y, 2 - fld.d $f11, T0, 0 - fldx.d $f17, T0, INC_Y - PTR_ADD T0, T0, INC_Y - PTR_ADD T0, T0, INC_Y - fld.d $f18, T0, 0 - fldx.d $f19, T0, INC_Y - GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 -.endm - -.macro DLOAD_Y_4_GAP - fld.d $f10, Y, 0 - fldx.d $f13, Y, INC_Y - PTR_ALSL T0, INC_Y, Y, 1 - fld.d $f14, T0, 0 - fldx.d $f15, T0, INC_Y - GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 -.endm - -.macro DSTORE_Y_8_GAP - xvstelm.d Y0, Y, 0, 0 - PTR_ADD T0, Y, INC_Y - xvstelm.d Y0, T0, 0, 1 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y0, T0, 0, 2 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y0, T0, 0, 3 - - PTR_ADD T0, T0, INC_Y - xvstelm.d Y1, T0, 0, 0 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y1, T0, 0, 1 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y1, T0, 0, 2 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y1, T0, 0, 3 -.endm - -.macro DSTORE_Y_4_GAP - xvstelm.d Y0, Y, 0, 0 - PTR_ADD T0, Y, INC_Y - xvstelm.d Y0, T0, 0, 1 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y0, T0, 0, 2 - PTR_ADD T0, T0, INC_Y - xvstelm.d Y0, T0, 0, 3 -.endm - -.macro DLOAD_X_8_GAP - xvldrepl.d X0, X, 0x00 - PTR_ADD T0, X, INC_X - xvldrepl.d X1, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X2, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X3, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X4, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X5, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X6, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X7, T0, 0x00 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ - X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -.endm - -.macro DLOAD_X_4_GAP - xvldrepl.d X0, X, 0x00 - PTR_ADD T0, X, INC_X - xvldrepl.d X1, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X2, T0, 0x00 - PTR_ADD T0, T0, INC_X - xvldrepl.d X3, T0, 0x00 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -.endm - -.macro DLOAD_X_2_GAP - xvldrepl.d X0, X, 0x00 - PTR_ADD T0, X, INC_X - xvldrepl.d X1, T0, 0x00 - GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA -.endm - -.macro DGEMV_N_8x8 - GLD_INC xv, , 0x20, \ - A0, PA0, 0, A1, PA0, 0, \ - A2, PA1, 0, A3, PA1, 0, \ - A4, PA2, 0, A5, PA2, 0, \ - A6, PA3, 0, A7, PA3, 0, \ - A8, PA4, 0, A9, PA4, 0, \ - A10, PA5, 0, A11, PA5, 0, \ - A12, PA6, 0, A13, PA6, 0, \ - A14, PA7, 0, A15, PA7, 0 - - GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ - Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ - Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ - Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ - Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ - Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ - Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ - Y0, A14, X7, Y0, Y1, A15, X7, Y1 -.endm - -.macro DGEMV_N_4x8 - GLD_INC xv, , 0x20, A0, PA0, 0, \ - A2, PA1, 0, \ - A4, PA2, 0, \ - A6, PA3, 0, \ - A8, PA4, 0, \ - A10, PA5, 0, \ - A12, PA6, 0, \ - A14, PA7, 0 - - GMADD xvf, d, Y0, A0, X0, Y0, \ - Y0, A2, X1, Y0, \ - Y0, A4, X2, Y0, \ - Y0, A6, X3, Y0, \ - Y0, A8, X4, Y0, \ - Y0, A10, X5, Y0, \ - Y0, A12, X6, Y0, \ - Y0, A14, X7, Y0 -.endm - -.macro DGEMV_N_1x8 - GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ - $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 - GMADD f, d, $f10, $f12, $f2, $f10, \ - $f10, $f14, $f3, $f10, \ - $f10, $f16, $f4, $f10, \ - $f10, $f18, $f5, $f10, \ - $f10, $f20, $f6, $f10, \ - $f10, $f22, $f7, $f10, \ - $f10, $f24, $f8, $f10, \ - $f10, $f26, $f9, $f10, -.endm - -.macro DGEMV_N_8x4 - GLD_INC xv, , 0x20, \ - A0, PA0, 0, A1, PA0, 0, \ - A2, PA1, 0, A3, PA1, 0, \ - A4, PA2, 0, A5, PA2, 0, \ - A6, PA3, 0, A7, PA3, 0 - - GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ - Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ - Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ - Y0, A6, X3, Y0, Y1, A7, X3, Y1 -.endm - -.macro DGEMV_N_4x4 - GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 - - GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ - Y0, A4, X2, Y0, Y0, A6, X3, Y0 -.endm - -.macro DGEMV_N_1x4 - GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 - GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ - $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 -.endm - -.macro DGEMV_N_8x2 - GLD_INC xv, , 0x20, \ - A0, PA0, 0, A1, PA0, 0, \ - A2, PA1, 0, A3, PA1, 0 - GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ - Y0, A2, X1, Y0, Y1, A3, X1, Y1 -.endm - -.macro DGEMV_N_4x2 - GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 - GMADD xvf, d, Y0, A0, X0, Y0, \ - Y0, A2, X1, Y0 -.endm - -.macro DGEMV_N_1x2 - GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 - GMADD f, d, $f10, $f12, $f2, $f10, \ - $f10, $f14, $f3, $f10 -.endm - -.macro DGEMV_N_1x1 - fld.d $f12, PA0, 0 - PTR_ADDI PA0, PA0, 0x08 - fmadd.d $f10, $f12, $f2, $f10 -.endm - -.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req - PTR_SRLI J, N, 3 - beqz J, .L_\XW\()_N_7 - PTR_SLLI K_LDA, LDA, 3 - PTR_SUB K_LDA, K_LDA, M8 -.L_\XW\()_N_L8: - DLOAD_\X_8 - xor K, K, K - move Y, Y_ORG - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_M_7 -.align 5 -.L_\XW\()_M_L8: - DLOAD_\Y_8 - DGEMV_N_8x8 - DSTORE_\Y_8 - PTR_ADDI I, I, -1 - PTR_ALSL Y, INC_Y, Y, 3 - PTR_ADDI K, K, 8 - bnez I, .L_\XW\()_M_L8 -.L_\XW\()_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_M_3 - DLOAD_\Y_4 - DGEMV_N_4x8 - DSTORE_\Y_4 - PTR_ALSL Y, INC_Y, Y, 2 - PTR_ADDI K, K, 4 -.L_\XW\()_M_3: - andi I, M, 3 - beqz I, .L_\XW\()_M_END -.align 5 -.L_\XW\()_M_L1: - DLOAD_\Y_1 - DGEMV_N_1x8 - DSTORE_\Y_1 - PTR_ADDI I, I, -1 - PTR_ADD Y, Y, INC_Y - PTR_ADDI K, K, 1 - bnez I, .L_\XW\()_M_L1 -.L_\XW\()_M_END: - PTR_ADDI J, J, -1 -#if __loongarch_grlen == 64 - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ - PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#else - GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ - PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#endif - PTR_ALSL X, INC_X, X, 3 - bnez J, .L_\XW\()_N_L8 -.L_\XW\()_N_7: - andi J, N, 4 - beqz J, .L_\XW\()_N_3 - DLOAD_\X_4 - xor K, K, K - move Y, Y_ORG - - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_N_4_M_7 -.align 5 -.L_\XW\()_N_4_M_L8: - DLOAD_\Y_8 - DGEMV_N_8x4 - DSTORE_\Y_8 - PTR_ADDI I, I, -1 - PTR_ADDI K, K, 8 - PTR_ALSL Y, INC_Y, Y, 3 - bnez I, .L_\XW\()_N_4_M_L8 -.L_\XW\()_N_4_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_N_4_M_3 - DLOAD_\Y_4 - DGEMV_N_4x4 - DSTORE_\Y_4 - PTR_ALSL Y, INC_Y, Y, 2 - PTR_ADDI K, K, 4 -.L_\XW\()_N_4_M_3: - andi I, M, 3 - beqz I, .L_\XW\()_N_4_M_END -.align 5 -.L_\XW\()_N_4_M_L1: - DLOAD_\Y_1 - DGEMV_N_1x4 - DSTORE_\Y_1 - PTR_ADDI I, I, -1 - PTR_ADD Y, Y, INC_Y - PTR_ADDI K, K, 1 - bnez I, .L_\XW\()_N_4_M_L1 -.L_\XW\()_N_4_M_END: - PTR_SLLI K_LDA, LDA, 2 - PTR_SUB K_LDA, K_LDA, M8 -#if __loongarch_grlen == 64 - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -#else - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -#endif - PTR_ALSL X, INC_X, X, 2 -.L_\XW\()_N_3: - andi J, N, 2 - beqz J, .L_\XW\()_N_1 - DLOAD_\X_2 - xor K, K, K - move Y, Y_ORG - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_N_2_M_7 -.align 5 -.L_\XW\()_N_2_M_L8: - DLOAD_\Y_8 - DGEMV_N_8x2 - DSTORE_\Y_8 - PTR_ADDI I, I, -1 - PTR_ADDI K, K, 8 - PTR_ALSL Y, INC_Y, Y, 3 - bnez I, .L_\XW\()_N_2_M_L8 -.L_\XW\()_N_2_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_N_2_M_3 - DLOAD_\Y_4 - DGEMV_N_4x2 - DSTORE_\Y_4 - PTR_ALSL Y, INC_Y, Y, 2 - PTR_ADDI K, K, 4 -.L_\XW\()_N_2_M_3: - andi I, M, 3 - beqz I, .L_\XW\()_N_2_M_END -.align 5 -.L_\XW\()_N_2_M_L1: - DLOAD_\Y_1 - DGEMV_N_1x2 - DSTORE_\Y_1 - PTR_ADDI I, I, -1 - PTR_ADD Y, Y, INC_Y - PTR_ADDI K, K, 1 - bnez I, .L_\XW\()_N_2_M_L1 -.L_\XW\()_N_2_M_END: - PTR_SLLI K_LDA, LDA, 1 - PTR_SUB K_LDA, K_LDA, M8 - PTR_ADD PA0, PA0, K_LDA - PTR_ADD PA1, PA1, K_LDA - PTR_ALSL X, INC_X, X, 1 -.L_\XW\()_N_1: - andi J, N, 1 - beqz J, .L_END - DLOAD_\X_1 - xor K, K, K - move Y, Y_ORG - move I, M - beqz I, .L_END -.align 5 -.L_\XW\()_N_1_M_L1: - DLOAD_\Y_1 - DGEMV_N_1x1 - DSTORE_\Y_1 - PTR_ADDI I, I, -1 - PTR_ADD Y, Y, INC_Y - PTR_ADDI K, K, 1 - bnez I, .L_\XW\()_N_1_M_L1 - b .L_END -.endm - - PROLOGUE - PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 7, 24 + 4 - PTR_ADDI K, $r0, 0x01 - PTR_SUB I, INC_X, K - PTR_SUB J, INC_Y, K - maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ - maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ - PTR_ALSL I, I, J, 1 - GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 - xvreplve0.d VALPHA, $xr0 - move Y_ORG, Y - move PA0, A -#if __loongarch_grlen == 64 - GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ - PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#else - GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ - PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#endif - la.local T0, .L_GAP_TABLE - PTR_ALSL I, I, T0, 1 - ld.h K, I, 0 - PTR_ADD T0, T0, K - jirl $r0, T0, 0 -.L_GAP_TABLE: - .hword .L_GAP_0_0 - .L_GAP_TABLE - .hword .L_GAP_0_1 - .L_GAP_TABLE - .hword .L_GAP_1_0 - .L_GAP_TABLE - .hword .L_GAP_1_1 - .L_GAP_TABLE -.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ - DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 -.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ - DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 -.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ - DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 -.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ - DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 -.L_END: - pop_if_used 17 + 7, 24 + 4 - jirl $r0, $r1, 0x0 - EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S deleted file mode 100644 index be90cb1af..000000000 --- a/kernel/loongarch64/dgemv_t_8_lasx.S +++ /dev/null @@ -1,468 +0,0 @@ -/******************************************************************************* -Copyright (c) 2023, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#define ASSEMBLER - -#include "common.h" -#include "loongarch64_asm.S" - -/********************************************************************* -* 2023/07/17 guxiwei -* UTEST : OK -* CTEST : OK -* TEST : OK -* -* -*********************************************************************/ - -/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, - * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) - */ -#define M $r4 -#define N $r5 -#define ALPHA $f0 -#define A $r7 -#define LDA $r8 -#define X $r9 -#define INC_X $r10 -#define Y $r11 -#define INC_Y $r6 - -#define J $r12 -#define I $r13 -#define K $r14 -#define PY0 $r14 -#define X_ORG $r15 -#define PY1 $r16 -#define K_LDA $r17 -#define PY2 $r18 -#define T0 $r19 -#define PA0 $r20 -#define PA1 $r23 -#define PA2 $r24 -#define PA3 $r25 -#define PA4 $r26 -#define PA5 $r27 -#define PA6 $r28 -#define PA7 $r29 -#define M8 $r30 - -#define VALPHA $xr0 -#define X0 $xr1 -#define X1 $xr2 -#define A0 $xr3 -#define A1 $xr4 -#define A2 $xr5 -#define A3 $xr6 -#define A4 $xr7 -#define A5 $xr8 -#define A6 $xr9 -#define A7 $xr10 -#define A8 $xr11 -#define A9 $xr12 -#define A10 $xr13 -#define A11 $xr14 -#define A12 $xr15 -#define A13 $xr16 -#define A14 $xr17 -#define A15 $xr18 -#define TP0 $xr19 -#define TP1 $xr20 -#define TP2 $xr21 -#define TP3 $xr22 -#define TP4 $xr23 -#define TP5 $xr24 -#define TP6 $xr25 -#define TP7 $xr26 -#define Y0 $xr3 -#define Y1 $xr4 -#define Y2 $xr5 -#define Y3 $xr6 -#define Y4 $xr7 -#define Y5 $xr8 -#define Y6 $xr9 -#define Y7 $xr10 - -.macro ZERO_Y8 - GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ - TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 -.endm - -.macro ZERO_Y4 - GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 -.endm - -.macro ZERO_Y2 - GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 -.endm - -.macro ZERO_Y1 - GXOR xv, v, TP0, TP0, TP0 -.endm - -.macro DLOAD_X8 - GLD xv, , X0, X, 0x00, X1, X, 0x20 -.endm - -.macro DLOAD_X4 - GLD xv, , X0, X, 0x00 -.endm - -.macro DLOAD_X8_GAP - fld.d $f1, X, 0x00 - fldx.d $f2, X, INC_X - PTR_ALSL T0, INC_X, X, 1 - fld.d $f3, T0, 0x00 - fldx.d $f4, T0, INC_X - GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 - PTR_ALSL T0, INC_X, X, 2 - fld.d $f2, T0, 0x00 - fldx.d $f3, T0, INC_X - PTR_ALSL T0, INC_X, T0, 1 - fld.d $f4, T0, 0x00 - fldx.d $f5, T0, INC_X - GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 -.endm - -.macro DLOAD_X4_GAP - fld.d $f1, X, 0x00 - fldx.d $f2, X, INC_X - PTR_ALSL T0, INC_X, X, 1 - fld.d $f3, T0, 0x00 - fldx.d $f4, T0, INC_X - GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 -.endm - -.macro DGEMV_T_8x8 - GLD_INC xv, , 0x20, \ - A0, PA0, 0, A1, PA0, 0, \ - A2, PA1, 0, A3, PA1, 0, \ - A4, PA2, 0, A5, PA2, 0, \ - A6, PA3, 0, A7, PA3, 0, \ - A8, PA4, 0, A9, PA4, 0, \ - A10, PA5, 0, A11, PA5, 0, \ - A12, PA6, 0, A13, PA6, 0, \ - A14, PA7, 0, A15, PA7, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ - TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ - TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ - TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ - TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ - TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ - TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ - TP7, A14, X0, TP7, TP7, A15, X1, TP7 -.endm - -.macro DGEMV_T_8x4 - GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ - A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ - TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ - TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ - TP6, A12, X0, TP6, TP7, A14, X0, TP7, -.endm - -.macro DGEMV_T_4x8 - GLD_INC xv, , 0x20, \ - A0, PA0, 0, A1, PA0, 0, \ - A2, PA1, 0, A3, PA1, 0, \ - A4, PA2, 0, A5, PA2, 0, \ - A6, PA3, 0, A7, PA3, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ - TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ - TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ - TP3, A6, X0, TP3, TP3, A7, X1, TP3 -.endm - -.macro DGEMV_T_4x4 - GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ - TP2, A4, X0, TP2, TP3, A6, X0, TP3 -.endm - -.macro DGEMV_T_2x8 - GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ - TP1, A2, X0, TP1, TP1, A3, X1, TP1 -.endm - -.macro DGEMV_T_2x4 - GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 - - GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 -.endm - -.macro DGEMV_T XW:req X8:req, X4:req - PTR_SRLI J, N, 3 - beqz J, .L_\XW\()_N_7 - PTR_SLLI K_LDA, LDA, 3 - PTR_SUB K_LDA, K_LDA, M8 -.L_\XW\()_N_L8: - ZERO_Y8 - move X, X_ORG - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_M_7 -.align 5 -.L_\XW\()_M_L8: - DLOAD_\X8 - DGEMV_T_8x8 - PTR_ADDI I, I, -1 - PTR_ALSL X, INC_X, X, 3 - bnez I, .L_\XW\()_M_L8 -.L_\XW\()_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_M_3 - DLOAD_\X4 - DGEMV_T_8x4 - PTR_ALSL X, INC_X, X, 2 -.L_\XW\()_M_3: - // Accumulated - GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ - Y5, TP5, Y6, TP6, Y7, TP7 - andi I, M, 3 - beqz I, .L_\XW\()_M_END -.align 5 -.L_\XW\()_M_L1: - fld.d $f1, X, 0x00 - fld.d $f11, PA0, 0x00 - fld.d $f12, PA1, 0x00 - fld.d $f13, PA2, 0x00 - fld.d $f14, PA3, 0x00 - fld.d $f15, PA4, 0x00 - fld.d $f16, PA5, 0x00 - fld.d $f17, PA6, 0x00 - fld.d $f18, PA7, 0x00 -#if __loongarch_grlen == 64 - GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ - PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -#else - GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ - PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -#endif - GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ - $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 - PTR_ADDI I, I, -1 - PTR_ADD X, X, INC_X - bnez I, .L_\XW\()_M_L1 -.L_\XW\()_M_END: - fld.d $f11, Y, 0x00 - fldx.d $f12, Y, INC_Y - PTR_ALSL PY0, INC_Y, Y, 1 - fld.d $f13, PY0, 0x00 - fldx.d $f14, PY0, INC_Y - PTR_ALSL PY1, INC_Y, Y, 2 - fld.d $f15, PY1, 0x00 - fldx.d $f16, PY1, INC_Y - PTR_ALSL PY2, INC_Y, PY1, 1 - fld.d $f17, PY2, 0x00 - fldx.d $f18, PY2, INC_Y - - GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ - $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 - - PTR_ADDI J, J, -1 -#if __loongarch_grlen == 64 - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ - PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#else - GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ - PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#endif - fst.d $f11, Y, 0x00 - fstx.d $f12, Y, INC_Y - fst.d $f13, PY0, 0x00 - fstx.d $f14, PY0, INC_Y - fst.d $f15, PY1, 0x00 - fstx.d $f16, PY1, INC_Y - fst.d $f17, PY2, 0x00 - fstx.d $f18, PY2, INC_Y - PTR_ALSL Y, INC_Y, Y, 3 - bnez J, .L_\XW\()_N_L8 -.L_\XW\()_N_7: - andi J, N, 4 - beqz J, .L_\XW\()_N_3 - ZERO_Y4 - move X, X_ORG - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_N_4_M_7 -.align 5 -.L_\XW\()_N_4_M_L8: - DLOAD_\X8 - DGEMV_T_4x8 - PTR_ADDI I, I, -1 - PTR_ALSL X, INC_X, X, 3 - bnez I, .L_\XW\()_N_4_M_L8 -.L_\XW\()_N_4_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_N_4_M_3 - DLOAD_\X4 - DGEMV_T_4x4 - PTR_ALSL X, INC_X, X, 2 -.L_\XW\()_N_4_M_3: - // Accumulated - GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 - andi I, M, 3 - beqz I, .L_\XW\()_N_4_M_END -.align 5 -.L_\XW\()_N_4_M_L1: - fld.d $f1, X, 0x00 - GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 - GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 - PTR_ADDI I, I, -1 - PTR_ADD X, X, INC_X - bnez I, .L_\XW\()_N_4_M_L1 -.L_\XW\()_N_4_M_END: - fld.d $f11, Y, 0x00 - fldx.d $f12, Y, INC_Y - PTR_ALSL PY0, INC_Y, Y, 1 - fld.d $f13, PY0, 0x00 - fldx.d $f14, PY0, INC_Y - - GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 - - PTR_SLLI K_LDA, LDA, 2 - PTR_SUB K_LDA, K_LDA, M8 - -#if __loongarch_grlen == 64 - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -#else - GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -#endif - fst.d $f11, Y, 0x00 - fstx.d $f12, Y, INC_Y - fst.d $f13, PY0, 0x00 - fstx.d $f14, PY0, INC_Y - PTR_ALSL Y, INC_Y, Y, 2 -.L_\XW\()_N_3: - andi J, N, 2 - beqz J, .L_\XW\()_N_1 - ZERO_Y2 - move X, X_ORG - PTR_SRLI I, M, 3 - beqz I, .L_\XW\()_N_2_M_7 -.align 5 -.L_\XW\()_N_2_M_L8: - DLOAD_\X8 - DGEMV_T_2x8 - PTR_ADDI I, I, -1 - PTR_ALSL X, INC_X, X, 3 - bnez I, .L_\XW\()_N_2_M_L8 -.L_\XW\()_N_2_M_7: - andi I, M, 4 - beqz I, .L_\XW\()_N_2_M_3 - DLOAD_\X4 - DGEMV_T_2x4 - PTR_ALSL X, INC_X, X, 2 -.L_\XW\()_N_2_M_3: - // Accumulated - GACC xvf, d, Y0, TP0, Y1, TP1 - andi I, M, 3 - beqz I, .L_\XW\()_N_2_M_END -.align 5 -.L_\XW\()_N_2_M_L1: - fld.d $f1, X, 0x00 - GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 - GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 - PTR_ADDI I, I, -1 - PTR_ADD X, X, INC_X - bnez I, .L_\XW\()_N_2_M_L1 -.L_\XW\()_N_2_M_END: - fld.d $f11, Y, 0x00 - fldx.d $f12, Y, INC_Y - - GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 - - PTR_SLLI K_LDA, LDA, 1 - PTR_SUB K_LDA, K_LDA, M8 - -#if __loongarch_grlen == 64 - GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -#else - GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA -#endif - fst.d $f11, Y, 0x00 - fstx.d $f12, Y, INC_Y - PTR_ALSL Y, INC_Y, Y, 1 -.L_\XW\()_N_1: - andi J, N, 1 - beqz J, .L_END - ZERO_Y1 - move X, X_ORG - move I, M - beqz I, .L_END -.align 5 -.L_\XW\()_N_1_M_L1: - fld.d $f3, PA0, 0x00 - fld.d $f1, X, 0x00 - fmadd.d $f19, $f3, $f1, $f19 - PTR_ADDI I, I, -1 - PTR_ADD X, X, INC_X - PTR_ADDI PA0, PA0, 0x08 - bnez I, .L_\XW\()_N_1_M_L1 - fld.d $f3, Y, 0x00 - fmadd.d $f3, ALPHA, $f19, $f3 - fst.d $f3, Y, 0x00 - b .L_END -.endm - - PROLOGUE - PTR_LD INC_Y, $sp, 0 - push_if_used 17 + 8, 24 + 3 - PTR_ADDI K, $r0, 0x01 - PTR_SUB I, INC_X, K - maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ - GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 - xvreplve0.d VALPHA, $xr0 - move X_ORG, X - move PA0, A -#if __loongarch_grlen == 64 - GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ - PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#else - GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ - PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#endif - la.local T0, .L_GAP_TABLE - PTR_ALSL I, I, T0, 1 - ld.h K, I, 0 - PTR_ADD T0, T0, K - jirl $r0, T0, 0 -.L_GAP_TABLE: - .hword .L_GAP_0 - .L_GAP_TABLE - .hword .L_GAP_1 - .L_GAP_TABLE -.L_GAP_0: /* if (incx == 1) */ - DGEMV_T GAP_0, X8, X4 -.L_GAP_1: /* if (incx != 1) */ - DGEMV_T GAP_1, X8_GAP, X4_GAP -.L_END: - pop_if_used 17 + 8, 24 + 3 - jirl $r0, $r1, 0x0 - EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S deleted file mode 100644 index 8876cbed9..000000000 --- a/kernel/loongarch64/loongarch64_asm.S +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -Copyright (c) 2023, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#if __loongarch_grlen == 64 -#define LA_REG int64_t -#define REG_SIZE 8 -#define REG_LOG 3 -#define PTR_ADDI addi.d -#define PTR_ADD add.d -#define PTR_SUB sub.d -#define PTR_LD ld.d -#define PTR_ST st.d -#define PTR_SLLI slli.d -#define PTR_SRLI srli.d -#define PTR_ALSL alsl.d -#else -#define LA_REG int32_t -#define REG_SIZE 4 -#define REG_LOG 2 -#define PTR_ADDI addi.w -#define PTR_ADD add.w -#define PTR_SUB sub.w -#define PTR_LD ld.w -#define PTR_ST st.w -#define PTR_SLLI slli.w -#define PTR_SRLI srli.w -#define PTR_ALSL alsl.w -#endif - -#if __loongarch_frlen == 64 -#define FREG_SIZE 8 -#define FREG_LOG 3 -#define PTR_FLD fld.d -#define PTR_FST fst.d -#else -#define FREG_SIZE 4 -#define FREG_LOG 2 -#define PTR_FLD fld.s -#define PTR_FST fst.s -#endif - -// The max registers available to the user which -// do not need to be preserved across calls. -// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html -#define MAX_INT_CALLER_SAVED 17 -#define MAX_FP_CALLER_SAVED 24 - -.altmacro // Enable alternate macro mode - -.macro push_if_used regs, fregs -.if \regs > MAX_INT_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) - push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 -.endif -.if \fregs > MAX_FP_CALLER_SAVED - PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) - push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 -.endif -.endm // End push_if_used -.macro pop_if_used regs, fregs -.if \fregs > MAX_FP_CALLER_SAVED - pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG -.endif -.if \regs > MAX_INT_CALLER_SAVED - pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 - PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG -.endif -.endm // End pop_if_used -.macro push_regs from, to - PTR_ST $s\()\from, $sp, \from << REG_LOG -.if \to - \from - push_regs %from + 1, \to -.endif -.endm // End push_regs -.macro pop_regs from, to - PTR_LD $s\()\from, $sp, \from << REG_LOG -.if \to - \from - pop_regs %from + 1, \to -.endif -.endm // End pop_regs -.macro push_fregs from, to - PTR_FST $fs\()\from, $sp, \from << FREG_LOG -.if \to - \from - push_fregs %from + 1, \to -.endif -.endm // End push_fregs -.macro pop_fregs from, to - PTR_FLD $fs\()\from, $sp, \from << FREG_LOG -.if \to - \from - pop_fregs %from + 1, \to -.endif -.endm // End pop_fregs - -// -// Instruction Related Macros -// -// GLD -// -.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg -.ifeqs "\suf_op", "0" - \pre_op\()ld \out, \src, \offset -.else - \pre_op\()ld.\suf_op \out, \src, \offset -.endif -.ifnb \more - GLD \pre_op, \suf_op, \more -.endif -.endm - -// -// GLD_INC -// -.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg -.ifeqs "\suf_op", "0" - \pre_op\()ld \out, \src, \offset -.else - \pre_op\()ld.\suf_op \out, \src, \offset -.endif - PTR_ADDI \src, \src, \inc -.ifnb \more - GLD_INC \pre_op, \suf_op, \inc, \more -.endif -.endm -// -// GLDX is same as GLD except the stride is a register -// -.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg -.ifeqs "\suf_op", "0" - \pre_op\()ldx \out, \src, \offset -.else - \pre_op\()ldx.\suf_op \out, \src, \offset -.endif -.ifnb \more - GLDX \pre_op, \suf_op, \more -.endif -.endm -// -// GLDREPL -// -.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg - \pre_op\()ldrepl.\suf_op \out, \src, \offset -.ifnb \more - GLDREPL \pre_op, \suf_op, \more -.endif -.endm -// -// GST -// -.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg -.ifeqs "\suf_op", "0" - \pre_op\()st \src, \dst, \offset -.else - \pre_op\()st.\suf_op \src, \dst, \offset -.endif -.ifnb \more - GST \pre_op, \suf_op, \more -.endif -.endm -// -// GMUL -// -.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()mul.\suf_op \out, \in0, \in1 -.ifnb \more - GMUL \pre_op, \suf_op, \more -.endif -.endm -// -// GMADD -// -.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg - \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 -.ifnb \more - GMADD \pre_op, \suf_op, \more -.endif -.endm -// -// GADD -// -.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()add.\suf_op \out, \in0, \in1 -.ifnb \more - GADD \pre_op, \suf_op, \more -.endif -.endm -// -// GADDI -// -.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()addi.\suf_op \out, \in0, \in1 -.ifnb \more - GADDI \pre_op, \suf_op, \more -.endif -.endm -// -// GSLLI -// -.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()slli.\suf_op \out, \in0, \in1 -.ifnb \more - GSLLI \pre_op, \suf_op, \more -.endif -.endm -// -// GINSVE0 -// -.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()insve0.\suf_op \out, \in0, \in1 -.ifnb \more - GINSVE0 \pre_op, \suf_op, \more -.endif -.endm -// -// GXOR -// -.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg - \pre_op\()xor.\suf_op \out, \in0, \in1 -.ifnb \more - GXOR \pre_op, \suf_op, \more -.endif -.endm - -// -// Compound instructions -// -// GACC: Accumulate the values of vector registers -// -.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -.ifeqs "\pre_op", "xvf" - xvpermi.q \out, \in, 0x01 - \pre_op\()add.\suf_op \in, \out, \in - xvpackod.d \out, \in, \in - \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" - xvpackod.w \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.endif -.endif - -.ifeqs "\pre_op", "vf" - vpackod.d \out, \in, \in - \pre_op\()add.\suf_op \out, \out, \in -.ifeqs "\suf_op", "s" - vpackod.w \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.endif -.endif - -.ifeqs "\pre_op", "xv" - xvpermi.q \out, \in, 0x01 - \pre_op\()add.\suf_op \in, \out, \in - xvpackod.d \out, \in, \in - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" - xvpackod.w \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" - xvpackod.h \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" - xvpackod.b \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.endif -.endif -.endif -.endif - -.ifeqs "\pre_op", "v" - vpackod.d \out, \in, \in - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "d" - vpackod.w \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "w" - vpackod.h \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.ifnc "\suf_op", "h" - vpackod.b \in, \out, \out - \pre_op\()add.\suf_op \out, \out, \in -.endif -.endif -.endif -.endif - -.ifnb \more - GACC \pre_op, \suf_op, \more -.endif -.endm From 71fcee6eef23bd058d596c42f2d90494a629b401 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 29 Jun 2023 11:11:08 +0800 Subject: [PATCH 257/718] LoongArch64: Update dgemm kernel --- kernel/loongarch64/dgemm_kernel_16x4.S | 4058 ++++++++++-------------- 1 file changed, 1664 insertions(+), 2394 deletions(-) diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index 13faa977e..f8e26fda2 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +/********************************************************************* +* 2023/06/28 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/06/28 guxiwei +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 32 +* DGEMM_DEFAULT_Q 152 +* DGEMM_DEFAULT_R 858 +* A_PR1 1024 +* B_PR1 256 +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 36.0 GFLOPS +* 2 threads: 71.6 GFLOPS +* 3 threads: 101.5 GFLOPS +* 4 threads: 132.8 GFLOPS +*********************************************************************/ + /* Function parameters */ #define M $r4 // param 1: bm #define N $r5 // param 2: bn @@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U4 $xr4 #define U5 $xr5 #define U6 $xr6 -#define D0 $xr7 -#define D1 $xr8 -#define D2 $xr9 -#define D3 $xr10 -#define D4 $xr11 -#define D5 $xr12 -#define D6 $xr13 -#define D7 $xr14 -#define D8 $xr15 -#define D9 $xr16 -#define D10 $xr17 -#define D11 $xr18 -#define D12 $xr19 -#define D13 $xr20 -#define D14 $xr21 -#define D15 $xr22 -#define VALPHA $xr23 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define VALPHA $xr15 /* Prefetch interval */ -#define A_PRE 0x200 +#define A_PRE 0x400 #define B_PRE 0x100 - PROLOGUE - - addi.d $sp, $sp, -56 - /* Store regs */ - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 24 - SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST ALPHA, $sp, 48 - - /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ - xvld VALPHA, $sp, 48 - xvreplve0.d VALPHA, VALPHA - -#if defined (TRMMKERNEL) && !defined(LEFT) - sub.d OFF, ZERO, OFFSET -#else - xor OFF, OFF, OFF -#endif - - /* if (!(N >> 2)) goto L_N3 */ - srai.d J, N, 2 /* J = bn >> 2 */ - andi N, N, 0x03 - beq ZERO, J, .L_N3 - -.L_J1: /* J-- && This loop include Condition 1 */ - -/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* -* dgemm_core_16x4 */ - move C0, C - move A0, A - slli.d T0, LDC, 3 - add.d C1, C0, T0 - addi.d J, J, -1 /* J-- */ - add.d C2, C1, T0 - add.d C3, C2, T0 - -#if defined(TRMMKERNEL) && defined(LEFT) - move OFF, OFFSET -#endif - - /* if (!(M >> 4)) goto L_M8 */ - srai.d I, M, 4 /* I = bm >> 4 */ - beq ZERO, I, .L_M8 - -.L_I1: /* I-- */ -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x07 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 16 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - /* Calculate the first set of D0~D15, - * avoidig set 0 operation - * Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ +.macro KERNEL2x16x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - preld 0, C0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - preld 0, C0, 0x40 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - preld 0, C1, 0x00 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - preld 0, C1, 0x40 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - xvldrepl.d U4, B0, 0x10 - preld 0, C2, 0x00 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 - preld 0, C2, 0x40 - xvfmul.d D10, U2, U4 - xvfmul.d D11, U3, U4 - - xvldrepl.d U4, B0, 0x18 - preld 0, C3, 0x00 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 - preld 0, C3, 0x40 - xvfmul.d D14, U2, U4 - xvfmul.d D15, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_L7 */ - beq ZERO,TL, .L_L7 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Calculate 8 sets of D0~D15 */ -.L_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 +.endm - /***8-3***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x16x4_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm - /***8-5***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4 xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvldrepl.d U12, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 +.endm - /***8-7***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4_END xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm - /***8-8***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_TL1 + xvld U8, A0, 0x00 - /* Maybe we need calculate the last - * 7 sets of D0~D15? - */ -.L_L7: - /* if (!(L & 7)) goto L_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_L0 + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 -.L_L71: - /* Load 16 * 64 from A0 */ + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 - blt ZERO,TL, .L_L71 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm -.L_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D2, D2, VALPHA - xvfmul.d D3, D3, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D6, D6, VALPHA - xvfmul.d D7, D7, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D10, D10, VALPHA - xvfmul.d D11, D11, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA - xvfmul.d D14, D14, VALPHA - xvfmul.d D15, D15, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvld U2, C0, 0x40 - xvld U3, C0, 0x60 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 - xvfmadd.d D2, D2, VALPHA, U2 - xvfmadd.d D3, D3, VALPHA, U3 +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvld U2, C2, 0x40 - xvld U3, C2, 0x60 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 - xvfmadd.d D10, D10, VALPHA, U2 - xvfmadd.d D11, D11, VALPHA, U3 +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvld U2, C3, 0x40 - xvld U3, C3, 0x60 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 - xvfmadd.d D14, D14, VALPHA, U2 - xvfmadd.d D15, D15, VALPHA, U3 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - xvst D2, C0, 0x40 - xvst D3, C0, 0x60 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - xvst D6, C1, 0x40 - xvst D7, C1, 0x60 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - xvst D10, C2, 0x40 - xvst D11, C2, 0x60 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 - xvst D14, C3, 0x40 - xvst D15, C3, 0x60 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 - /* Add stride for C */ - addi.d C0, C0, 0x80 - addi.d C1, C1, 0x80 - addi.d C2, C2, 0x80 - addi.d C3, C3, 0x80 + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -16 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x07 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_I1 +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 -.L_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_M0 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - andi I, M, 8 - beq ZERO,I, .L_M4 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M8_L7 */ - beq ZERO,TL, .L_M8_L7 +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 -.L_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm - /***8-2***/ +.macro KERNEL2x16x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm - /***8-3***/ +.macro KERNEL2x8x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-4***/ +.macro KERNEL2x8x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-5***/ +.macro KERNEL2x4x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-6***/ +.macro KERNEL2x4x2_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm - /***8-7***/ +.macro KERNEL2x2x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - /***8-8***/ +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M8_TL1 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 -.L_M8_L7: - /* if (!(L & 7)) goto L_M8_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_M8_L0 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm -.L_M8_L71: +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm - addi.d TL, TL, -1 - blt ZERO,TL, .L_M8_L71 +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 -.L_M8_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 + xvldrepl.d U4, B0, 0x00 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 -#endif // #if defined(TRMMKERNEL) + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - /* Add stride for C */ - addi.d C0, C0, 0x40 - addi.d C1, C1, 0x40 - addi.d C2, C2, 0x40 - addi.d C3, C3, 0x40 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -8 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x06 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm -#ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm -/********LOOP (if(N >> 2 ) && (M & 8)) End************/ +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 -.L_M4: - andi I, M, 4 - beq ZERO,I, .L_M2 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - add.d B0, B, T0 -#endif + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm - /* Load 4 * 64 from A0 */ +.macro KERNEL2x8x1_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M4_L7 */ - beq ZERO,TL, .L_M4_L7 +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm -.L_M4_TL1: /* TL-- */ - /***8-1***/ +.macro KERNEL2x4x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x08 +.endm - /***8-2***/ +.macro KERNEL2x4x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm - /***8-3***/ +.macro KERNEL2x2x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm - /***8-4***/ +.macro KERNEL2x2x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm - /***8-5***/ +.macro KERNEL2x1x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm - /***8-6***/ +.macro KERNEL2x1x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + PROLOGUE - /***8-7***/ - xvld U0, A0, 0x00 + addi.d $sp, $sp, -120 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA, $sp, 112 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_J1: /* J-- && This loop include Condition 1 */ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 - /***8-8***/ +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 - addi.d A0, A0, 0x20 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x16x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M4_TL1 + blt ZERO,TL, .L_TL1 -.L_M4_L7: - /* if (!(L & 7)) goto L_M4_L0 */ +.L_TL1_END: + KERNEL8x16x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M4_L0 + beq TL, ZERO,.L_L0 -.L_M4_L71: +.L_L71: + /* Load 16 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x20 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M4_L71 + blt ZERO,TL, .L_L71 -.L_M4_L0: +.L_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U8, C2, 0x00 + xvld U9, C2, 0x20 + xvld U10, C2, 0x40 + xvld U11, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U8 + xvfmadd.d D9, D9, VALPHA, U9 + xvfmadd.d D10, D10, VALPHA, U10 + xvfmadd.d D11, D11, VALPHA, U11 /* Load C3 */ xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) /* Store C0 */ xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 /* Store C1 */ xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 /* Store C2 */ xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 /* Store C3 */ xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 /* Add stride for C */ - addi.d C0, C0, 0x20 - addi.d C1, C1, 0x20 - addi.d C2, C2, 0x20 - addi.d C3, C3, 0x20 + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT - /* number of values in A */ - addi.d L, L, -4 + /* number of values in A */ + addi.d L, L, -16 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x05 + slli.d T0, L, 0x07 add.d A0, A0, T0 + slli.d T0, L, 0x05 add.d B0, B0, T0 #endif #ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x04 + addi.d OFF, OFF, 0x10 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 -.L_M2: - andi I, M, 2 - beq ZERO,I, .L_M1 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x04 + slli.d T0, OFF, 0x06 add.d A0, A0, T0 slli.d T0, OFF, 0x05 add.d B0, B, T0 @@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 2 + addi.d L, OFF, 8 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // #if !defined(TRMMKERNEL) move B0, B move L, K /* L = bk */ -#endif +#endif // #if defined(TRMMKERNEL) - /* Load 2 * 64 from A0 */ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M2_L7 */ - beq ZERO,TL, .L_M2_L7 - -.L_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 - addi.d A0, A0, 0x10 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + beq ZERO, TL, .L_M8_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 +.L_M8_TL1: /* TL-- */ + KERNEL8x8x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M2_TL1 + blt ZERO,TL, .L_M8_TL1 -.L_M2_L7: - /* if (!(L & 7)) goto L_M2_L0 */ +.L_M8_TL1_END: + KERNEL8x8x4_END + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M2_L0 + beq TL, ZERO,.L_M8_L0 -.L_M2_L71: +.L_M8_L71: xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M2_L71 + blt ZERO,TL, .L_M8_L71 -.L_M2_L0: +.L_M8_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U4 + xvfmadd.d D9, D9, VALPHA, U5 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U6 + xvfmadd.d D13, D13, VALPHA, U7 #endif // #if defined(TRMMKERNEL) - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 - xvstelm.d D0, C0, 0x08, 0x01 - xvstelm.d D4, C1, 0x08, 0x01 - xvstelm.d D8, C2, 0x08, 0x01 - xvstelm.d D12, C3, 0x08, 0x01 + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 /* Add stride for C */ - addi.d C0, C0, 0x10 - addi.d C1, C1, 0x10 - addi.d C2, C2, 0x10 - addi.d C3, C3, 0x10 + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT /* number of values in A */ - addi.d L, L, -2 + addi.d L, L, -8 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x04 + slli.d T0, L, 0x06 add.d A0, A0, T0 slli.d T0, L, 0x05 add.d B0, B0, T0 @@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef LEFT /* number of values in A */ - addi.d OFF, OFF, 0x02 + addi.d OFF, OFF, 0x08 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -.L_M1: - andi I, M, 1 - beq ZERO,I, .L_M0 +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x03 - add.d A0, A0, T0 slli.d T0, OFF, 0x05 + add.d A0, A0, T0 add.d B0, B, T0 #endif @@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 1 + addi.d L, OFF, 4 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move L, K /* L = bk */ #endif - /* Load 1 * 64 from A0 */ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M1_L7 */ - beq ZERO,TL, .L_M1_L7 + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 -.L_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + beq ZERO, TL, .L_M4_TL1_END - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 - /***8-2***/ +.L_M4_L71: xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 @@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x18 xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x08 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - /***8-3***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M4_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Load C1 */ + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + /* Load C2 */ + xvld U2, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U2 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + /* Load C3 */ + xvld U3, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 - /***8-4***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 - addi.d A0, A0, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-5***/ - xvld U0, A0, 0x00 + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_M2_L71: + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U4, B0, 0x00 - addi.d A0, A0, 0x08 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-6***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M2_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 + xvstelm.d D1, C0, 0x08, 0x00 + xvstelm.d D1, C1, 0x08, 0x01 + xvstelm.d D1, C2, 0x08, 0x02 + xvstelm.d D1, C3, 0x08, 0x03 +#else + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvpermi.q U2, U0, 0x20 + xvpermi.q U3, U1, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D0, D4, VALPHA, U2 + xvfmadd.d D1, D5, VALPHA, U3 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +#endif // #if defined(TRMMKERNEL) - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif - /***8-8***/ - xvld U0, A0, 0x00 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U8, A0, 0x00 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 + xvld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END .L_M1_L7: /* if (!(L & 7)) goto L_M1_L0 */ @@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M1_L0 .L_M1_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M1_L71 .L_M1_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 #else /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvldrepl.d U0, C0, 0x00 + xvfmadd.d D4, D0, VALPHA, U0 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvldrepl.d U1, C1, 0x00 + xvfmadd.d D5, D0, VALPHA, U1 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvldrepl.d U2, C2, 0x00 + xvfmadd.d D6, D0, VALPHA, U2 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvldrepl.d U3, C3, 0x00 + xvfmadd.d D7, D0, VALPHA, U3 - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D4, C0, 0x00, 0x00 + xvstelm.d D5, C1, 0x00, 0x01 + xvstelm.d D6, C2, 0x00, 0x02 + xvstelm.d D7, C3, 0x00, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x08 @@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ///////////////////////////////////////////////// /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + xvldrepl.d VALPHA, $sp, 112 + .L_N3: andi J, N, 2 beq ZERO, J, .L_N1 @@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d L, OFF, 2 #endif #else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_L7 */ - beq ZERO,TL, .L_N3_L7 - -.L_N3_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ - /* Load 16 * 64 from A0 */ + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 xvld U2, A0, 0x40 xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x16x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x16x2_END .L_N3_L7: /* if (!(L & 7)) goto L_N3_L0 */ @@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 @@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M8_L7 */ beq ZERO,TL, .L_N3_M8_L7 -.L_N3_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + beq ZERO, TL, .L_N3_M8_TL1_END - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 +.L_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M8_TL1 +.L_N3_M8_TL1_END: + KERNEL8x8x2_END .L_N3_M8_L7: /* if (!(L & 7)) goto L_N3_M8_L0 */ @@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d B0, B0, T0 #endif -#ifdef LEFT - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) - -/********LOOP (if(N & 2) && (M & 8) ) End************/ - -.L_N3_M4: - andi I, M, 4 - beq ZERO,I, .L_N3_M2 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - slli.d T0, OFF, 0x04 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M4_L7 */ - beq ZERO,TL, .L_N3_M4_L7 - -.L_N3_M4_TL1: /* TL-- */ - /***8-1***/ - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) - /***8-5***/ - xvld U0, A0, 0x00 +/********LOOP (if(N & 2) && (M & 8) ) End************/ - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-6***/ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x20 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END .L_N3_M4_L7: /* if (!(L & 7)) goto L_N3_M4_L0 */ @@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x20 @@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M2_L7 */ beq ZERO,TL, .L_N3_M2_L7 -.L_N3_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 + xvld U8, A0, 0x00 - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x10 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END .L_N3_M2_L7: /* if (!(L & 7)) goto L_N3_M2_L0 */ @@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 @@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /* number of values in B */ addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M1_L7 */ - beq ZERO,TL, .L_N3_M1_L7 - -.L_N3_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 1 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 - /***8-8***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x08 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END .L_N3_M1_L7: /* if (!(L & 7)) goto L_N3_M1_L0 */ @@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_L7 */ beq ZERO,TL, .L_N1_L7 -.L_N1_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x80 addi.d B0, B0, 0x08 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x16x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x16x1_END .L_N1_L7: /* if (!(L & 7)) goto L_N1_L0 */ @@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slli.d T0, L, 0x07 add.d A0, A0, T0 - slli.d T0, L, 0x03 - add.d B0, B0, T0 -#endif - -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) - - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_N1_I1 - -.L_N1_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_N1_M0 - - andi I, M, 8 - beq ZERO,I, .L_N1_M4 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x03 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 1 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N1_M8_L7 */ - beq ZERO,TL, .L_N1_M8_L7 - -.L_N1_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + andi I, M, 8 + beq ZERO,I, .L_N1_M4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x40 addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_M8_TL1_END +.L_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_TL1_END: + KERNEL8x8x1_END + .L_N1_M8_L7: /* if (!(L & 7)) goto L_N1_M8_L0 */ andi TL, L, 7 @@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M4_L7 */ beq ZERO,TL, .L_N1_M4_L7 -.L_N1_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END .L_N1_M4_L7: /* if (!(L & 7)) goto L_N1_M4_L0 */ @@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M2_L7 */ beq ZERO,TL, .L_N1_M2_L7 -.L_N1_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x10 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END .L_N1_M2_L7: /* if (!(L & 7)) goto L_N1_M2_L0 */ @@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M1_L7 */ beq ZERO,TL, .L_N1_M1_L7 -.L_N1_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x08 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M1_TL1_END - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END .L_N1_M1_L7: /* if (!(L & 7)) goto L_N1_M1_L0 */ @@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 LD $f23, $sp, 40 - addi.d $sp, $sp, 56 + LD $f24, $sp, 48 + LD $f25, $sp, 56 + LD $f26, $sp, 64 + LD $f27, $sp, 72 + LD $f28, $sp, 80 + LD $f29, $sp, 88 + LD $f30, $sp, 96 + LD $f31, $sp, 104 + addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0 From e8b571d245d57fb143c25fadda18fcc01a84b3f5 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 11 Jul 2023 10:01:12 +0800 Subject: [PATCH 258/718] LoongArch64: Add dgemv_t_8_lasx.S and dgemv_n_8_lasx.S V2 --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/KERNEL.generic | 4 + kernel/loongarch64/dgemv_n_8_lasx.S | 546 ++++++++++++++++++++++++++ kernel/loongarch64/dgemv_t_8_lasx.S | 468 ++++++++++++++++++++++ kernel/loongarch64/loongarch64_asm.S | 313 +++++++++++++++ 5 files changed, 1334 insertions(+) create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S create mode 100644 kernel/loongarch64/loongarch64_asm.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 253aa2464..9d858584c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -8,6 +8,9 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMVNKERNEL = dgemv_n_8_lasx.S +DGEMVTKERNEL = dgemv_t_8_lasx.S endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index b772a6f82..213add9ee 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = ../arm/gemv_n.c +ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c +endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c +endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S new file mode 100644 index 000000000..940d27569 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -0,0 +1,546 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/14 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 + +.macro DLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GMUL xvf, d, X0, X0, VALPHA +.endm + +.macro DLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DLOAD_Y_4 + GLD xv, , Y0, Y, 0 +.endm + +.macro DLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro DSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DSTORE_Y_4 + GST xv, , Y0, Y, 0 +.endm + +.macro DSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +// Unable to use vector load/store ins +.macro DLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro DLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 +.endm + +.macro DSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro DSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 +.endm + +.macro DLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ + Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ + Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ + Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ + Y0, A14, X7, Y0, Y1, A15, X7, Y1 +.endm + +.macro DGEMV_N_4x8 + GLD_INC xv, , 0x20, A0, PA0, 0, \ + A2, PA1, 0, \ + A4, PA2, 0, \ + A6, PA3, 0, \ + A8, PA4, 0, \ + A10, PA5, 0, \ + A12, PA6, 0, \ + A14, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, \ + Y0, A6, X3, Y0, \ + Y0, A8, X4, Y0, \ + Y0, A10, X5, Y0, \ + Y0, A12, X6, Y0, \ + Y0, A14, X7, Y0 +.endm + +.macro DGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, \ + $f10, $f18, $f5, $f10, \ + $f10, $f20, $f6, $f10, \ + $f10, $f22, $f7, $f10, \ + $f10, $f24, $f8, $f10, \ + $f10, $f26, $f9, $f10, +.endm + +.macro DGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1 +.endm + +.macro DGEMV_N_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, Y0, A6, X3, Y0 +.endm + +.macro DGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 +.endm + +.macro DGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1 +.endm + +.macro DGEMV_N_4x2 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0 +.endm + +.macro DGEMV_N_1x2 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10 +.endm + +.macro DGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + fmadd.d $f10, $f12, $f2, $f10 +.endm + +.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + DLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x8 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\Y_4 + DGEMV_N_4x8 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x8 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + DLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x4 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\Y_4 + DGEMV_N_4x4 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_4_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x4 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + DLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x2 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\Y_4 + DGEMV_N_4x2 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_2_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x2 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + DLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x1 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 24 + 4 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 24 + 4 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S new file mode 100644 index 000000000..be90cb1af --- /dev/null +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -0,0 +1,468 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/17 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro DLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro DLOAD_X4 + GLD xv, , X0, X, 0x00 +.endm + +.macro DLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro DLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 +.endm + +.macro DGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ + TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ + TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ + TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ + TP7, A14, X0, TP7, TP7, A15, X1, TP7 +.endm + +.macro DGEMV_T_8x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ + A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ + TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ + TP6, A12, X0, TP6, TP7, A14, X0, TP7, +.endm + +.macro DGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3 +.endm + +.macro DGEMV_T_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3 +.endm + +.macro DGEMV_T_2x8 + GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1 +.endm + +.macro DGEMV_T_2x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 +.endm + +.macro DGEMV_T XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\X8 + DGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\X4 + DGEMV_T_8x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ + $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ + $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\X8 + DGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\X4 + DGEMV_T_4x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_4_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\X8 + DGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\X4 + DGEMV_T_2x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_2_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1 + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + fmadd.d $f19, $f3, $f1, $f19 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 + fld.d $f3, Y, 0x00 + fmadd.d $f3, ALPHA, $f19, $f3 + fst.d $f3, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 24 + 3 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + DGEMV_T GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + DGEMV_T GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 24 + 3 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S new file mode 100644 index 000000000..8876cbed9 --- /dev/null +++ b/kernel/loongarch64/loongarch64_asm.S @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#if __loongarch_grlen == 64 +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_ALSL alsl.d +#else +#define LA_REG int32_t +#define REG_SIZE 4 +#define REG_LOG 2 +#define PTR_ADDI addi.w +#define PTR_ADD add.w +#define PTR_SUB sub.w +#define PTR_LD ld.w +#define PTR_ST st.w +#define PTR_SLLI slli.w +#define PTR_SRLI srli.w +#define PTR_ALSL alsl.w +#endif + +#if __loongarch_frlen == 64 +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#else +#define FREG_SIZE 4 +#define FREG_LOG 2 +#define PTR_FLD fld.s +#define PTR_FST fst.s +#endif + +// The max registers available to the user which +// do not need to be preserved across calls. +// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html +#define MAX_INT_CALLER_SAVED 17 +#define MAX_FP_CALLER_SAVED 24 + +.altmacro // Enable alternate macro mode + +.macro push_if_used regs, fregs +.if \regs > MAX_INT_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) + push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.endif +.if \fregs > MAX_FP_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) + push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.endif +.endm // End push_if_used +.macro pop_if_used regs, fregs +.if \fregs > MAX_FP_CALLER_SAVED + pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.endif +.if \regs > MAX_INT_CALLER_SAVED + pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.endif +.endm // End pop_if_used +.macro push_regs from, to + PTR_ST $s\()\from, $sp, \from << REG_LOG +.if \to - \from + push_regs %from + 1, \to +.endif +.endm // End push_regs +.macro pop_regs from, to + PTR_LD $s\()\from, $sp, \from << REG_LOG +.if \to - \from + pop_regs %from + 1, \to +.endif +.endm // End pop_regs +.macro push_fregs from, to + PTR_FST $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + push_fregs %from + 1, \to +.endif +.endm // End push_fregs +.macro pop_fregs from, to + PTR_FLD $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + pop_fregs %from + 1, \to +.endif +.endm // End pop_fregs + +// +// Instruction Related Macros +// +// GLD +// +.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLD \pre_op, \suf_op, \more +.endif +.endm + +// +// GLD_INC +// +.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif + PTR_ADDI \src, \src, \inc +.ifnb \more + GLD_INC \pre_op, \suf_op, \inc, \more +.endif +.endm +// +// GLDX is same as GLD except the stride is a register +// +.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ldx \out, \src, \offset +.else + \pre_op\()ldx.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLDX \pre_op, \suf_op, \more +.endif +.endm +// +// GLDREPL +// +.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg + \pre_op\()ldrepl.\suf_op \out, \src, \offset +.ifnb \more + GLDREPL \pre_op, \suf_op, \more +.endif +.endm +// +// GST +// +.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()st \src, \dst, \offset +.else + \pre_op\()st.\suf_op \src, \dst, \offset +.endif +.ifnb \more + GST \pre_op, \suf_op, \more +.endif +.endm +// +// GMUL +// +.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()mul.\suf_op \out, \in0, \in1 +.ifnb \more + GMUL \pre_op, \suf_op, \more +.endif +.endm +// +// GMADD +// +.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GMADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADD +// +.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()add.\suf_op \out, \in0, \in1 +.ifnb \more + GADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADDI +// +.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()addi.\suf_op \out, \in0, \in1 +.ifnb \more + GADDI \pre_op, \suf_op, \more +.endif +.endm +// +// GSLLI +// +.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()slli.\suf_op \out, \in0, \in1 +.ifnb \more + GSLLI \pre_op, \suf_op, \more +.endif +.endm +// +// GINSVE0 +// +.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()insve0.\suf_op \out, \in0, \in1 +.ifnb \more + GINSVE0 \pre_op, \suf_op, \more +.endif +.endm +// +// GXOR +// +.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \more + GXOR \pre_op, \suf_op, \more +.endif +.endm + +// +// Compound instructions +// +// GACC: Accumulate the values of vector registers +// +.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "xv" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + xvpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + xvpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifeqs "\pre_op", "v" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + vpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifnb \more + GACC \pre_op, \suf_op, \more +.endif +.endm From 4d0f000db63fedbe62ca318bb7bcd5d3152fc4a2 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 7 Aug 2023 16:55:59 +0800 Subject: [PATCH 259/718] MIPS: Enable MSA --- kernel/mips/KERNEL.P5600 | 34 ++++++++++----------- kernel/mips64/KERNEL.LOONGSON3R4 | 32 ++++++++++---------- param.h | 52 ++++++++++++++++---------------- 3 files changed, 59 insertions(+), 59 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..2e1e3390d 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c CSUMKERNEL = ../mips/zsum.c ZSUMKERNEL = ../mips/zsum.c -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c ZASUMKERNEL = ../mips/zasum.c endif -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c @@ -100,7 +100,7 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c @@ -112,7 +112,7 @@ CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c @@ -124,7 +124,7 @@ CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -136,7 +136,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -148,7 +148,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -162,7 +162,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -180,7 +180,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -198,7 +198,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -212,7 +212,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -224,7 +224,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -236,7 +236,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -248,7 +248,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 index b81e5441d..1149d97f1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3R4 +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -1,4 +1,4 @@ -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S DAXPYKERNEL = daxpy_loongson3a_simd.S endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c endif DSDOTKERNEL = ../mips/dot.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c SGEMVTKERNEL = ../mips/sgemv_t_msa.c @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c endif -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/param.h b/param.h index 547463b2f..a4672ce07 100644 --- a/param.h +++ b/param.h @@ -2748,19 +2748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#ifdef HAVE_MSA -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2772,6 +2760,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 64 @@ -2961,19 +2961,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2985,6 +2973,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 128 From 4cc804c75434ffba0dd6cdd4f5ed4b332f525d77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Aug 2023 16:13:23 +0200 Subject: [PATCH 260/718] Prepare for INCX < 0 in new NRM2 implementation from BLAS 3.10 --- interface/nrm2.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/interface/nrm2.c b/interface/nrm2.c index dc8c08e9a..331ebc3d0 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0.; +#ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif IDEBUG_START; FUNCTION_PROFILE_START(); @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; + #ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); From a2d867f4d1257c4c47b5b7afacbbc554182885bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:49:05 +0200 Subject: [PATCH 261/718] Allow negative iNCX (API change from version 3.10 of the reference implementation) --- kernel/arm/nrm2.c | 2 +- kernel/arm/znrm2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; From d15ffb7fdfce1609c0fddd813bc6c4236150c445 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:50:44 +0200 Subject: [PATCH 262/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/loongarch64/cnrm2.S | 2 +- kernel/loongarch64/dnrm2.S | 2 +- kernel/loongarch64/snrm2.S | 2 +- kernel/loongarch64/znrm2.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index 9d27987e1..41667485a 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 2 bge $r0, I, .L25 LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S index ff937ae53..2160b93a6 100644 --- a/kernel/loongarch64/dnrm2.S +++ b/kernel/loongarch64/dnrm2.S @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X NOP LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 57c21a017..8c5c91ade 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L15 diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S index 49f640268..8e2165ab7 100644 --- a/kernel/loongarch64/znrm2.S +++ b/kernel/loongarch64/znrm2.S @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X MOV s2, s1 srai.d I, N, 2 From f6921787921862a744e4366b7a42ffcab1993177 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:52:09 +0200 Subject: [PATCH 263/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/mips/nrm2.c | 2 +- kernel/mips/znrm2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/mips/nrm2.c +++ b/kernel/mips/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c index 85be39cd1..d11a6bd4a 100644 --- a/kernel/mips/znrm2.c +++ b/kernel/mips/znrm2.c @@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; From 7dd441d5db84c21723256976f275e2b3bf59533c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:53:33 +0200 Subject: [PATCH 264/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/mips64/cnrm2.S | 2 +- kernel/mips64/dnrm2.S | 2 +- kernel/mips64/snrm2.S | 2 +- kernel/mips64/znrm2.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S index 76fa9c295..159f9bea9 100644 --- a/kernel/mips64/cnrm2.S +++ b/kernel/mips64/cnrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT dsra I, N, 2 diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index cd40414a2..1b55d9fc3 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -81,7 +81,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT move XX, X diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S index 1ba061a7d..f18151b5c 100644 --- a/kernel/mips64/snrm2.S +++ b/kernel/mips64/snrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S index 1c247bca9..d33284a47 100644 --- a/kernel/mips64/znrm2.S +++ b/kernel/mips64/znrm2.S @@ -80,7 +80,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT move XX, X From 54d3246fc6f8282539bb9dd2be996f65ebb5a51e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:55:17 +0200 Subject: [PATCH 265/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/power/cnrm2.S | 2 +- kernel/power/cnrm2_hummer.S | 2 +- kernel/power/cnrm2_ppc440.S | 2 +- kernel/power/dnrm2_hummer.S | 2 +- kernel/power/dnrm2_ppc440.S | 2 +- kernel/power/nrm2.S | 2 +- kernel/power/snrm2.S | 2 +- kernel/power/snrm2_hummer.S | 2 +- kernel/power/snrm2_ppc440.S | 2 +- kernel/power/znrm2.S | 2 +- kernel/power/znrm2_hummer.S | 2 +- kernel/power/znrm2_ppc440.S | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S index c115650fd..74117a831 100644 --- a/kernel/power/cnrm2.S +++ b/kernel/power/cnrm2.S @@ -99,7 +99,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S index 46c29c654..0d036b32f 100644 --- a/kernel/power/cnrm2_hummer.S +++ b/kernel/power/cnrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) andi. r0, X, 2 * SIZE - 1 bne LL(100) diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S index c71c34b7c..8e3abf9f9 100644 --- a/kernel/power/cnrm2_ppc440.S +++ b/kernel/power/cnrm2_ppc440.S @@ -104,7 +104,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 sub X, X, INCX diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S index 4931f5ab1..8638ca424 100644 --- a/kernel/power/dnrm2_hummer.S +++ b/kernel/power/dnrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S index 849ca1f35..529f6adf0 100644 --- a/kernel/power/dnrm2_ppc440.S +++ b/kernel/power/dnrm2_ppc440.S @@ -111,7 +111,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S index d9e1f4e9a..880b5d1b4 100644 --- a/kernel/power/nrm2.S +++ b/kernel/power/nrm2.S @@ -113,7 +113,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S index be974cc48..696d404bb 100644 --- a/kernel/power/snrm2.S +++ b/kernel/power/snrm2.S @@ -97,7 +97,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S index a0ff3d1b2..a4292af78 100644 --- a/kernel/power/snrm2_hummer.S +++ b/kernel/power/snrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) cmpwi cr0, INCX, SIZE bne LL(100) diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S index 0a80d1224..3547d7f47 100644 --- a/kernel/power/snrm2_ppc440.S +++ b/kernel/power/snrm2_ppc440.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S index 60f379d25..3048e3480 100644 --- a/kernel/power/znrm2.S +++ b/kernel/power/znrm2.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S index 1d0c598f8..4ef2212df 100644 --- a/kernel/power/znrm2_hummer.S +++ b/kernel/power/znrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S index 778b805de..f775c3e62 100644 --- a/kernel/power/znrm2_ppc440.S +++ b/kernel/power/znrm2_ppc440.S @@ -112,7 +112,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X From a34a0a7abceb6682c841c1dbd43728005ff7b940 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:56:52 +0200 Subject: [PATCH 266/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/riscv64/nrm2.c | 2 +- kernel/riscv64/znrm2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/riscv64/nrm2.c +++ b/kernel/riscv64/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/riscv64/znrm2.c +++ b/kernel/riscv64/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; From c211da0688ddead349a14dfb26a760c02280b433 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 16:58:57 +0200 Subject: [PATCH 267/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/sparc/cnrm2.S | 2 +- kernel/sparc/dnrm2.S | 2 +- kernel/sparc/snrm2.S | 2 +- kernel/sparc/znrm2.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S index 8dc4b56b6..0840c8848 100644 --- a/kernel/sparc/cnrm2.S +++ b/kernel/sparc/cnrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, ZBASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S index cf7522953..41e993440 100644 --- a/kernel/sparc/dnrm2.S +++ b/kernel/sparc/dnrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, BASE_SHIFT, INCX add %sp, -8, %sp diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S index a80247259..a7405b6e1 100644 --- a/kernel/sparc/snrm2.S +++ b/kernel/sparc/snrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, BASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S index 065d22784..dae53ffe7 100644 --- a/kernel/sparc/znrm2.S +++ b/kernel/sparc/znrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, ZBASE_SHIFT, INCX add %sp, -8, %sp From 07e32c4cb8044219c588dfd32c33cef28b6cfdd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 17:00:18 +0200 Subject: [PATCH 268/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/x86/nrm2.S | 2 +- kernel/x86/nrm2_sse.S | 2 +- kernel/x86/znrm2.S | 2 +- kernel/x86/znrm2_sse.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S index 7a14da862..3a6417462 100644 --- a/kernel/x86/nrm2.S +++ b/kernel/x86/nrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $BASE_SHIFT, INCX fldz diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S index 0f174c408..129b41a03 100644 --- a/kernel/x86/nrm2_sse.S +++ b/kernel/x86/nrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 leal (, INCX, SIZE), INCX cmpl $SIZE, INCX diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S index 263612e9a..7a65df77a 100644 --- a/kernel/x86/znrm2.S +++ b/kernel/x86/znrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX fldz diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S index bbc3677ae..4ad326120 100644 --- a/kernel/x86/znrm2_sse.S +++ b/kernel/x86/znrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX From 34da1a067dabf10851f9c5838df64c0ed6c6a6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Aug 2023 17:01:50 +0200 Subject: [PATCH 269/718] Allow negative INCX (API change from version 3.10 of the reference implementation) --- kernel/x86_64/nrm2.S | 2 +- kernel/x86_64/nrm2_sse.S | 2 +- kernel/x86_64/znrm2.S | 2 +- kernel/x86_64/znrm2_sse.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index b79ac2adb..61cf8c452 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S index 33b1ee496..c1f3a45fc 100644 --- a/kernel/x86_64/nrm2_sse.S +++ b/kernel/x86_64/nrm2_sse.S @@ -57,7 +57,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 pxor %xmm2, %xmm2 leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 0d2aa3480..748fde310 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S index f78b83f7e..2274f2e98 100644 --- a/kernel/x86_64/znrm2_sse.S +++ b/kernel/x86_64/znrm2_sse.S @@ -58,7 +58,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 xorq FLAG, FLAG From 8a8a8479be704a413233f27c7f13b2abc489f5b5 Mon Sep 17 00:00:00 2001 From: Ian McInerney Date: Mon, 14 Aug 2023 15:41:28 +0100 Subject: [PATCH 270/718] Fix cooperlake and sapphire rapids march flags on clang The march=cooperlake and march=sapphirerapids flags were never getting added when building with Clang targetting those architectures. Instead it was falling back to the skylake AVX512 implementation. Clang added support for these two architectures in Clang 9 and Clang 12, so introduce new checks for those versions to enable the appropriate march flag, and fallback to skylake otherwise. --- Makefile.system | 5 ++++ Makefile.x86_64 | 74 +++++++++++++++++++++++++++++++++---------------- kernel/Makefile | 4 +-- 3 files changed, 57 insertions(+), 26 deletions(-) diff --git a/Makefile.system b/Makefile.system index 3c1648dc7..b3968d739 100644 --- a/Makefile.system +++ b/Makefile.system @@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif +ifeq ($(C_COMPILER), CLANG) +CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) +endif + # # OS dependent settings # diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 7ab331b1f..702447ace 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -75,18 +75,31 @@ endif ifeq ($(CORE), COOPERLAKE) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# cooperlake support was added in 10.1 -ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) -CCOMMON_OPT += -march=cooperlake -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=cooperlake -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # cooperlake support was added in 10.1 + ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # cooperlake support was added in clang 9 + ifeq ($(CLANGVERSIONGTEQ9), 1) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -104,18 +117,31 @@ endif ifeq ($(CORE), SAPPHIRERAPIDS) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# sapphire rapids support was added in 11 -ifeq ($(GCCVERSIONGTEQ11), 1) -CCOMMON_OPT += -march=sapphirerapids -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=sapphirerapids -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # sapphire rapids support was added in 11 + ifeq ($(GCCVERSIONGTEQ11), 1) + CCOMMON_OPT += -march=sapphirerapids + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=sapphirerapids + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # cooperlake support was added in clang 12 + ifeq ($(CLANGVERSIONGTEQ12), 1) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables diff --git a/kernel/Makefile b/kernel/Makefile index d426a1bdb..795f25eec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,7 +33,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ11), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) endif else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 -mavx512f From 0d30daa77275023fe3fe729349eaa85427a5416b Mon Sep 17 00:00:00 2001 From: TiborGY Date: Wed, 16 Aug 2023 00:07:17 +0200 Subject: [PATCH 271/718] Add junk from BF16 test to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1195bc9b8..8b27325db 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM test/SHBLAT3.SUMM +test/SBBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -82,6 +83,7 @@ test/sblat1 test/sblat2 test/sblat3 test/test_shgemm +test/test_sbgemm test/zblat1 test/zblat2 test/zblat3 From b5ba95a6c0a7043f4bc9e68e9da7063a2aedfbc5 Mon Sep 17 00:00:00 2001 From: TGY Date: Wed, 16 Aug 2023 00:48:40 +0200 Subject: [PATCH 272/718] Modernize obsolete inline order --- common.h | 2 +- common_alpha.h | 2 +- common_arm.h | 2 +- common_arm64.h | 2 +- common_power.h | 2 +- common_sparc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- common_zarch.h | 2 +- kernel/power/lock.c | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/common.h b/common.h index 4eeeb8d55..4074df069 100644 --- a/common.h +++ b/common.h @@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){ #endif // !RPCC_DEFINED #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; diff --git a/common_alpha.h b/common_alpha.h index 021eb93ae..e5380454a 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -45,7 +45,7 @@ #define WMB asm("wmb") #define RMB asm("mb") -static void __inline blas_lock(unsigned long *address){ +static __inline void blas_lock(unsigned long *address){ #ifndef __DECC unsigned long tmp1, tmp2; asm volatile( diff --git a/common_arm.h b/common_arm.h index 682315de5..a3db9953c 100644 --- a/common_arm.h +++ b/common_arm.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int register ret; diff --git a/common_arm64.h b/common_arm64.h index 6a18a294c..436ccb8f5 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; diff --git a/common_power.h b/common_power.h index a49197fd7..3fe776f23 100644 --- a/common_power.h +++ b/common_power.h @@ -91,7 +91,7 @@ void *qalloc(int flags, size_t bytes); -static void INLINE blas_lock(volatile unsigned long *address){ +static INLINE void blas_lock(volatile unsigned long *address){ long int ret, val = 1; diff --git a/common_sparc.h b/common_sparc.h index 90a24ebf1..4b9e7840a 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -45,7 +45,7 @@ #ifndef ASSEMBLER -static void __inline blas_lock(volatile unsigned long *address){ +static __inline void blas_lock(volatile unsigned long *address){ long int ret = 1; diff --git a/common_x86.h b/common_x86.h index bc77eca58..65fb9a460 100644 --- a/common_x86.h +++ b/common_x86.h @@ -54,7 +54,7 @@ #define __volatile__ #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ int ret; diff --git a/common_x86_64.h b/common_x86_64.h index 729a055ce..dda168d6c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -70,7 +70,7 @@ #define RMB #endif -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifndef C_MSVC diff --git a/common_zarch.h b/common_zarch.h index 442bae821..80609251b 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER /* -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ BLASULONG ret; diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 1c1b006b0..de28680d8 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -static void __inline blas_lock(volatile BLASULONG *address){ +static __inline void blas_lock(volatile BLASULONG *address){ #ifdef __GNUC__ From 79c15db348b7a9365a6abf3c32ebed3f4f1bbf62 Mon Sep 17 00:00:00 2001 From: Ian McInerney Date: Mon, 14 Aug 2023 21:36:35 +0100 Subject: [PATCH 273/718] Fix power10 gcc intrinsic check __builtin_vsx_assemble_pair was only in GCC 10-11.2 and was replaced by __builtin_vsx_build_pair thereafter. --- kernel/power/dgemm_small_kernel_nn_power10.c | 2 +- kernel/power/dgemm_small_kernel_tn_power10.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c index ecdc3e5c6..73f6d5b99 100644 --- a/kernel/power/dgemm_small_kernel_nn_power10.c +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c index 93a942b02..426948185 100644 --- a/kernel/power/dgemm_small_kernel_tn_power10.c +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); From 214be14c1d21b01713f3d89217de6a67dc23f369 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Aug 2023 22:48:30 +0200 Subject: [PATCH 274/718] Correct INFO returned for lda in non-CBLAS s/dgeadd --- interface/geadd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/geadd.c b/interface/geadd.c index f0befa14a..3a0ea015d 100644 --- a/interface/geadd.c +++ b/interface/geadd.c @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; From 553cc1372fa4fca3b5f71a92fa6218478af3c714 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 18 Aug 2023 17:39:44 +0800 Subject: [PATCH 275/718] LoongArch64: Add sgemm_kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 15 + kernel/loongarch64/loongarch64_asm.S | 94 + kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 2325 +++++++++++++++++++ kernel/loongarch64/sgemm_ncopy_16_lasx.S | 463 ++++ kernel/loongarch64/sgemm_ncopy_8_lasx.S | 298 +++ kernel/loongarch64/sgemm_tcopy_16_lasx.S | 526 +++++ kernel/loongarch64/sgemm_tcopy_8_lasx.S | 406 ++++ param.h | 12 +- 8 files changed, 4134 insertions(+), 5 deletions(-) create mode 100644 kernel/loongarch64/sgemm_kernel_16x8_lasx.S create mode 100644 kernel/loongarch64/sgemm_ncopy_16_lasx.S create mode 100644 kernel/loongarch64/sgemm_ncopy_8_lasx.S create mode 100644 kernel/loongarch64/sgemm_tcopy_16_lasx.S create mode 100644 kernel/loongarch64/sgemm_tcopy_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 9d858584c..67d1fd11c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -11,9 +11,24 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMVNKERNEL = dgemv_n_8_lasx.S DGEMVTKERNEL = dgemv_t_8_lasx.S + +SGEMMKERNEL = sgemm_kernel_16x8_lasx.S +SGEMMINCOPY = sgemm_ncopy_16_lasx.S +SGEMMITCOPY = sgemm_tcopy_16_lasx.S +SGEMMONCOPY = sgemm_ncopy_8_lasx.S +SGEMMOTCOPY = sgemm_tcopy_8_lasx.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index 8876cbed9..89243c620 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -36,6 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_ST st.d #define PTR_SLLI slli.d #define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d #define PTR_ALSL alsl.d #else #define LA_REG int32_t @@ -48,6 +50,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_ST st.w #define PTR_SLLI slli.w #define PTR_SRLI srli.w +#define PTR_SRAI srai.w +#define PTR_MUL mul.w #define PTR_ALSL alsl.w #endif @@ -218,6 +222,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif .endm // +// GSUB +// +.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()sub.\suf_op \out, \in0, \in1 +.ifnb \more + GSUB \pre_op, \suf_op, \more +.endif +.endm +// // GSLLI // .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg @@ -244,6 +257,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GXOR \pre_op, \suf_op, \more .endif .endm +// +// GPERMI +// +.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()permi.\suf_op \out, \in0, \in1 +.ifnb \more + GPERMI \pre_op, \suf_op, \more +.endif +.endm +// +// GNMSUB +// +.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GNMSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GPRELD +// +.macro GPRELD in0:req, in1:req, in2:req, more:vararg + preld \in0, \in1, \in2 +.ifnb \more + GPRELD \more +.endif +.endm // // Compound instructions @@ -311,3 +351,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GACC \pre_op, \suf_op, \more .endif .endm +// +// GMOV +// +.macro GMOV pre_op:req, out:req, in:req, more:vararg + \pre_op\()or.v \out, \in, \in +.ifnb \more + GMOV \pre_op, \more +.endif +.endm + +// +// Media Related Macros +// +.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()ilvl.\suf_op \out0, \in0, \in1 + \pre_op\()ilvh.\suf_op \out1, \in0, \in1 +.endm +.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()pickev.\suf_op \out0, \in0, \in1 + \pre_op\()pickod.\suf_op \out1, \in0, \in1 +.endm + +// +// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, +// has no pre_op param. 128-bit vector instructions are not supported. +// +.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + vt0, vt1 + GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 + GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 + GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 + GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 +.endm + +.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ + in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0, tmp1, tmp2, tmp3 + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 + GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 + + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 + GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 + + GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 + + GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ + \out2, \out6, 0x02, \out3, \out7, 0x02, \ + \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ + \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 +.endm diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S new file mode 100644 index 000000000..254dbe052 --- /dev/null +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -0,0 +1,2325 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/08/23 guxiwei +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 8 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 256 +* SGEMM_DEFAULT_Q 256 +* SGEMM_DEFAULT_R 1024 +* A_PRE 1024 +* B_PRE 256 // Enable prefetching for B results in a performance decrease, temporarily disabled. +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 71.7 GFLOPS +* 2 threads: 142.6 GFLOPS +* 3 threads: 211.5 GFLOPS +* 4 threads: 265.0 GFLOPS +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define C4 $r25 +#define C5 $r26 +#define C6 $r27 +#define C7 $r28 +#define T0 $r29 +#define T1 $r30 +#undef ZERO +#define ZERO $r0 + +/* LASX Vectors + * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data. + * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data. + * Use D0 to D15 to store intermediate values of the computation. + * Use VALPHA to store the broadcast value of alpha + */ +#define U0 $xr0 +#define U1 $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define D0 $xr10 +#define D1 $xr11 +#define D2 $xr12 +#define D3 $xr13 +#define D4 $xr14 +#define D5 $xr15 +#define D6 $xr16 +#define D7 $xr17 +#define D8 $xr18 +#define D9 $xr19 +#define D10 $xr20 +#define D11 $xr21 +#define D12 $xr22 +#define D13 $xr23 +#define D14 $xr24 +#define D15 $xr25 +#define VALPHA $xr26 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +// Loops outline: +// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ +// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. */ +// | | .L_M16_TL1 | | +// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | +// | | .L_M16_L71 | | +// | | .L_M16_L0 ---------------- | +// | .L_M8 | +// | | .L_M8_TL1 | | +// | | .L_M8_L7 | KERNEK8x8 | +// | | .L_M8_L71 | | +// | | .L_M8_L0 | | +// | .L_M4 | +// | | .L_M4_TL1 | | +// | | .L_M4_L7 | KERNEK4x8 | +// | | .L_M4_L71 | | +// | | .L_M4_L0 | | +// | .L_M2 | +// | | .L_M2_TL1 | | +// | | .L_M2_L7 | KERNEK2x8 | +// | | .L_M2_L71 | | +// | | .L_M2_L0 | | +// | .L_M1 | +// | | .L_M1_TL1 | | +// | | .L_M1_L7 | KERNEK1x8 | +// | | .L_M1_L71 | | +// | | .L_M1_L0 | | +// | .L_M0------------------------------------------------------------------------------------------ +// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ +// .L_N4 +// | .L_N4_M16 <--------------------- +// | | .L_N4_M16_TL1 | +// | | .L_N4_M16_L7 | KERNEL16x4 +// | | .L_N4_M16_L71 | +// | | .L_N4_M16_L0 ---------------- +// | .L_N4_M8 +// | | .L_N4_M8_TL1 | +// | | .L_N4_M8_L7 | KERNEL8x4 +// | | .L_N4_M8_L71 | +// | | .L_N4_M8_L0 | +// | .L_N4_M4 +// | | .L_N4_M4_TL1 | +// | | .L_N4_M4_L7 | KERNEL4x4 +// | | .L_N4_M4_L71 | +// | | .L_N4_M4_L0 | +// | .L_N4_M2 +// | | .L_N4_M2_TL1 | +// | | .L_N4_M2_L7 | KERNEL2x4 +// | | .L_N4_M2_L71 | +// | | .L_N4_M2_L0 | +// | .L_N4_M1 +// | | .L_N4_M1_TL1 | +// | | .L_N4_M1_L7 | KERNEL1x4 +// | | .L_N4_M1_L71 | +// | | .L_N4_M1_L0 | +// | .L_N4_M0 +// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ +// .L_N2 +// | .L_N2_M16 <--------------------- +// | | .L_N2_M16_TL1 | +// | | .L_N2_M16_L7 | KERNEL16x2 +// | | .L_N2_M16_L71 | +// | | .L_N2_M16_L0 ---------------- +// | .L_N2_M8 +// | | .L_N2_M8_TL1 | +// | | .L_N2_M8_L7 | KERNEL8x2 +// | | .L_N2_M8_L71 | +// | | .L_N2_M8_L0 | +// | .L_N2_M4 +// | | .L_N2_M4_TL1 | +// | | .L_N2_M4_L7 | KERNEL4x2 +// | | .L_N2_M4_L71 | +// | | .L_N2_M4_L0 | +// | .L_N2_M2 +// | | .L_N2_M2_TL1 | +// | | .L_N2_M2_L7 | KERNEL2x2 +// | | .L_N2_M2_L71 | +// | | .L_N2_M2_L0 | +// | .L_N2_M1 +// | | .L_N2_M1_TL1 | +// | | .L_N2_M1_L7 | KERNEL1x2 +// | | .L_N2_M1_L71 | +// | | .L_N2_M1_L0 | +// | .L_N2_M0 +// .L_N1 +// | .L_N1_M16 <--------------------- +// | | .L_N1_M16_TL1 | +// | | .L_N1_M16_L7 | KERNEL16x1 +// | | .L_N1_M16_L71 | +// | | .L_N1_M16_L0 ---------------- +// | .L_N1_M8 +// | | .L_N1_M8_TL1 | +// | | .L_N1_M8_L7 | KERNEL8x1 +// | | .L_N1_M8_L71 | +// | | .L_N1_M8_L0 | +// | .L_N1_M4 +// | | .L_N1_M4_TL1 | +// | | .L_N1_M4_L7 | KERNEL4x1 +// | | .L_N1_M4_L71 | +// | | .L_N1_M4_L0 | +// | .L_N1_M2 +// | | .L_N1_M2_TL1 | +// | | .L_N1_M2_L7 | KERNEL2x1 +// | | .L_N1_M2_L71 | +// | | .L_N1_M2_L0 | +// | .L_N1_M1 +// | | .L_N1_M1_TL1 | +// | | .L_N1_M1_L7 | KERNEL1x1 +// | | .L_N1_M1_L71 | +// | | .L_N1_M1_L0 | +// | .L_N1_M0 +// .L_N0 + +/*************** sgemm_kernel_macros ***************/ +.macro KERNEL1x16x8_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + preld 0, C0, 0x00 + GMUL xvf, s, D2, U0, X1, D3, U1, X1 + preld 0, C1, 0x00 + GMUL xvf, s, D4, U0, X2, D5, U1, X2 + preld 0, C2, 0x00 + GMUL xvf, s, D6, U0, X3, D7, U1, X3 + preld 0, C3, 0x00 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D9, U1, X4 + preld 0, C4, 0x00 + GMUL xvf, s, D10, U0, X5, D11, U1, X5 + preld 0, C5, 0x00 + GMUL xvf, s, D12, U0, X6, D13, U1, X6 + preld 0, C6, 0x00 + GMUL xvf, s, D14, U0, X7, D15, U1, X7 + preld 0, C7, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1x16x8 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + preld 0, A0, A_PRE + GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ + D10, U0, X5, D10, D11, U1, X5, D11 + //preld 0, B0, B_PRE + GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ + D14, U0, X7, D14, D15, U1, X7 D15 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8x16x8 +.rept 8 + KERNEL1x16x8 +.endr +.endm + +.macro SAVE16x8 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 + /* Load C4 */ + GLD xv, , X0, C4, 0x00, X1, C4, 0x20 + GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 + /* Load C5 */ + GLD xv, , X2, C5, 0x00, X3, C5, 0x20 + GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 + /* Load C6 */ + GLD xv, , X4, C6, 0x00, X5, C6, 0x20 + GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 + /* Load C7 */ + GLD xv, , X6, C7, 0x00, X7, C7, 0x20 + GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20, \ + D8, C4, 0x00, D9, C4, 0x20, \ + D10, C5, 0x00, D11, C5, 0x20, \ + D12, C6, 0x00, D13, C6, 0x20, \ + D14, C7, 0x00, D15, C7, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx8_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ + D12, U0, X6, D14, U0, X7 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1xMx8 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ + D12, U0, X6, D12, D14, U0, X7, D14 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8xMx8 m, stride +.rept 8 + KERNEL1xMx8 \m, \stride +.endr +.endm + +.macro SAVEMx8 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA, \ + D8, D8, VALPHA, D10, D10, VALPHA, \ + D12, D12, VALPHA, D14, D14, VALPHA +#else + /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +.if \m == 8 + GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 +.elseif \m == 4 + GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.endif + GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ + D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00, \ + D8, C4, 0x00, D10, C5, 0x00, \ + D12, C6, 0x00, D14, C7, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00, \ + $vr18, C4, 0x00, $vr20, C5, 0x00, \ + $vr22, C6, 0x00, $vr24, C7, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#endif +.endm + +.macro KERNEL1x16x4_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1, \ + D4, U0, X2, D5, U1, X2, \ + D6, U0, X3, D7, U1, X3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1x16x4 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3, \ + D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8x16x4 +.rept 8 + KERNEL1x16x4 +.endr +.endm + +.macro SAVE16x4 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx4_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1xMx4 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8xMx4 m, stride +.rept 8 + KERNEL1xMx4 \m, \stride +.endr +.endm + +.macro SAVEMx4 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA +#else + /* Load C0, C1, C2, C3 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#endif +.endm + +.macro KERNEL1x16x2_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1x16x2 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8x16x2 +.rept 8 + KERNEL1x16x2 +.endr +.endm + +.macro SAVE16x2 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx2_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D2, U0, X1 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1xMx2 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8xMx2 m, stride +.rept 8 + KERNEL1xMx2 \m, \stride +.endr +.endm + +.macro SAVEMx2 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride +#endif +.endm + +.macro KERNEL1x16x1_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1x16x1 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8x16x1 +.rept 8 + KERNEL1x16x1 +.endr +.endm + +.macro SAVE16x1 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40 +#else + GADDI , w, C0, C0, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx1_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1xMx1 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8xMx1 m, stride +.rept 8 + KERNEL1xMx1 \m, \stride +.endr +.endm + +.macro SAVEMx1 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride +#else + GADDI , w, C0, C0, \stride +#endif +.endm + + PROLOGUE + push_if_used 26, 32 + xvreplve0.w VALPHA, $xr0 +#if defined (TRMMKERNEL) && !defined(LEFT) + PTR_SUB OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + /* if (!(N >> 3)) goto L_N7 */ + PTR_SRAI J, N, 3 /* J = bn >> 3 */ + andi N, N, 0x07 + beq ZERO, J, .L_N7 +.L_N8: /* J -- */ + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADDI J, J, -1 /* J-- */ +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 +.align 5 +.L_M16: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x8_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M16_L7 */ + beq ZERO,TL, .L_M16_L7 +.align 5 +.L_M16_TL1: + KERNEL8x16x8 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M16_TL1 +.L_M16_L7: + andi TL, L, 7 + beq TL, ZERO,.L_M16_L0 +.align 5 +.L_M16_L71: + KERNEL1x16x8 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M16_L71 +.L_M16_L0: + SAVE16x8 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -16 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_M16 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + KERNEL1xMx8_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 +.align 5 +.L_M8_TL1: + KERNEL8xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 +.align 5 +.L_M8_L71: + KERNEL1xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M8_L71 +.L_M8_L0: + SAVEMx8 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -8 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 +.align 5 +.L_M4_TL1: + KERNEL8xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 +.L_M4_L71: + KERNEL1xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_L71 +.L_M4_L0: + SAVEMx8 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -4 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 2, 0x08 + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 +.align 5 +.L_M2_TL1: + KERNEL8xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 +.align 5 +.L_M2_L71: + KERNEL1xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M2_L71 +.L_M2_L0: + SAVEMx8 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -2 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 +.align 5 +.L_M1_TL1: + KERNEL8xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 +.align 5 +.L_M1_L71: + KERNEL1xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M1_L71 +.L_M1_L0: + SAVEMx8 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -1 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 5 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ +#endif + blt ZERO, J, .L_N8 + +.L_N7: + andi J, N, 4 + beq ZERO, J, .L_N3 +.L_N4: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N4_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N4_M8 +.align 5 +.L_N4_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x4_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_L7 */ + beq ZERO,TL, .L_N4_M16_L7 +.align 5 +.L_N4_M16_TL1: /* TL-- */ + KERNEL8x16x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M16_TL1 +.L_N4_M16_L7: + /* if (!(L & 7)) goto L_N4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M16_L0 +.align 5 +.L_N4_M16_L71: + KERNEL1x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M16_L71 +.L_N4_M16_L0: + SAVE16x4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N4_M16 +.L_N4_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N4_M0 + + andi I, M, 8 + beq ZERO,I, .L_N4_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M8_L7 */ + beq ZERO,TL, .L_N4_M8_L7 +.align 5 +.L_N4_M8_TL1: /* TL-- */ + KERNEL8xMx4 8, 0x20 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M8_TL1 +.L_N4_M8_L7: + /* if (!(L & 7)) goto L_N4_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M8_L0 +.align 5 +.L_N4_M8_L71: + KERNEL1xMx4 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M8_L71 +.L_N4_M8_L0: + SAVEMx4 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M4: + andi I, M, 4 + beq ZERO,I, .L_N4_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M4_L7 */ + beq ZERO,TL, .L_N4_M4_L7 +.align 5 +.L_N4_M4_TL1: /* TL-- */ + KERNEL8xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M4_TL1 +.L_N4_M4_L7: + /* if (!(L & 7)) goto L_N4_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M4_L0 +.align 5 +.L_N4_M4_L71: + KERNEL1xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M4_L71 +.L_N4_M4_L0: + SAVEMx4 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M2: + andi I, M, 2 + beq ZERO,I, .L_N4_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M2_L7 */ + beq ZERO,TL, .L_N4_M2_L7 +.align 5 +.L_N4_M2_TL1: /* TL-- */ + KERNEL8xMx4 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M2_TL1 +.L_N4_M2_L7: + /* if (!(L & 7)) goto L_N4_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M2_L0 +.align 5 +.L_N4_M2_L71: + KERNEL1xMx4 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M2_L71 +.L_N4_M2_L0: + SAVEMx4 2, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M1: + andi I, M, 1 + beq ZERO,I, .L_N4_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M1_L7 */ + beq ZERO,TL, .L_N4_M1_L7 +.align 5 +.L_N4_M1_TL1: /* TL-- */ + KERNEL8xMx4 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M1_TL1 +.L_N4_M1_L7: + /* if (!(L & 7)) goto L_N4_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M1_L0 +.align 5 +.L_N4_M1_L71: + KERNEL1xMx4 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M1_L71 +.L_N4_M1_L0: + SAVEMx4 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M0: + /* Add stride for B and C + * B += 4 * K + * C += 4 * LDC + */ + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 4 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x04 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +.L_N2: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADD C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N2_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N2_M8 +.align 5 +.L_N2_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x2_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M16_L7 */ + beq ZERO,TL, .L_N2_M16_L7 +.align 5 +.L_N2_M16_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M16_TL1 +.L_N2_M16_L7: + /* if (!(L & 7)) goto L_N2_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M16_L0 +.align 5 +.L_N2_M16_L71: + KERNEL1x16x2 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M16_L71 +.L_N2_M16_L0: + SAVE16x2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N2_M16 +.L_N2_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N2_M0 + + andi I, M, 8 + beq ZERO,I, .L_N2_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M8_L7 */ + beq ZERO,TL, .L_N2_M8_L7 +.align 5 +.L_N2_M8_TL1: /* TL-- */ + KERNEL8xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M8_TL1 +.L_N2_M8_L7: + /* if (!(L & 7)) goto L_N2_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M8_L0 +.align 5 +.L_N2_M8_L71: + KERNEL1xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M8_L71 +.L_N2_M8_L0: + SAVEMx2 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M4: + andi I, M, 4 + beq ZERO,I, .L_N2_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M4_L7 */ + beq ZERO,TL, .L_N2_M4_L7 +.align 5 +.L_N2_M4_TL1: /* TL-- */ + KERNEL8xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M4_TL1 +.L_N2_M4_L7: + /* if (!(L & 7)) goto L_N2_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M4_L0 +.align 5 +.L_N2_M4_L71: + KERNEL1xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M4_L71 +.L_N2_M4_L0: + SAVEMx2 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M2: + andi I, M, 2 + beq ZERO,I, .L_N2_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M2_L7 */ + beq ZERO,TL, .L_N2_M2_L7 +.align 5 +.L_N2_M2_TL1: /* TL-- */ + KERNEL8xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M2_TL1 +.L_N2_M2_L7: + /* if (!(L & 7)) goto L_N2_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M2_L0 +.align 5 +.L_N2_M2_L71: + KERNEL1xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M2_L71 +.L_N2_M2_L0: + SAVEMx2 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M1: + andi I, M, 1 + beq ZERO,I, .L_N2_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M1_L7 */ + beq ZERO,TL, .L_N2_M1_L7 +.align 5 +.L_N2_M1_TL1: /* TL-- */ + KERNEL8xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M1_TL1 +.L_N2_M1_L7: + /* if (!(L & 7)) goto L_N2_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M1_L0 +.align 5 +.L_N2_M1_L71: + KERNEL1xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M1_L71 +.L_N2_M1_L0: + SAVEMx2 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M0: + /* Add stride for B and C + * B += 2 * K + * C += 2 * LDC + */ + PTR_SLLI T0, K, 3 + PTR_SLLI T1, LDC, 3 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x02 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_N1_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 +.L_N1_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x1_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M16_L7 */ + beq ZERO,TL, .L_N1_M16_L7 +.align 5 +.L_N1_M16_TL1: /* TL-- */ + KERNEL8x16x1 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M16_TL1 +.L_N1_M16_L7: + /* if (!(L & 7)) goto L_N1_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M16_L0 +.align 5 +.L_N1_M16_L71: + KERNEL1x16x1 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M16_L71 +.L_N1_M16_L0: + SAVE16x1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_M16 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 +.align 5 +.L_N1_M8_TL1: /* TL-- */ + KERNEL8xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 +.align 5 +.L_N1_M8_L71: + KERNEL1xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 +.L_N1_M8_L0: + SAVEMx1 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 +.align 5 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8xMx1 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 +.align 5 +.L_N1_M4_L71: + KERNEL1xMx1 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 +.L_N1_M4_L0: + SAVEMx1 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 +.align 5 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8xMx1 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 +.align 5 +.L_N1_M2_L71: + KERNEL1xMx1 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 +.L_N1_M2_L0: + SAVEMx1 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 +.align 5 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8xMx1 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 +.align 5 +.L_N1_M1_L71: + KERNEL1xMx1 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + SAVEMx1 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S new file mode 100644 index 000000000..266c07c5c --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +// Loops outline +//.L_N16 <------------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0: --------------- +//.L_N15: +//.L_N8: +//| .L_N8_M8: +//| .L_N8_M7: +//| .L_N8_M1: +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 26, 32 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x04 + beq J, ZERO, .L_N15 +.align 5 +.L_N16: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S3, TL + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S9, S7, T0 + PTR_ADD S10, S8, T0 + PTR_ADD S11, S9, T0 + PTR_ADD S12, S10, T0 + PTR_ADD S13, S11, T0 + PTR_ADD S14, S12, T0 + PTR_ADD S15, S13, T0 + PTR_ADD S16, S14, T0 + PTR_ADD TS, S15, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ + U8, U9, U10, U11, U12, U13, U14, U15, \ + U0, U1, U2, U3 // As tmp + GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ + D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ + D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI S9, S9, 0x20 + PTR_ADDI S10, S10, 0x20 + PTR_ADDI S11, S11, 0x20 + PTR_ADDI S12, S12, 0x20 + PTR_ADDI S13, S13, 0x20 + PTR_ADDI S14, S14, 0x20 + PTR_ADDI S15, S15, 0x20 + PTR_ADDI S16, S16, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI TD, TD, 0x20 + + fld.s F0, S9, 0x00 + fld.s F1, S10, 0x00 + fld.s F2, S11, 0x00 + fld.s F3, S12, 0x00 + fld.s F4, S13, 0x00 + fld.s F5, S14, 0x00 + fld.s F6, S15, 0x00 + fld.s F7, S16, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S9, S9, 0x04 + PTR_ADDI S10, S10, 0x04 + PTR_ADDI S11, S11, 0x04 + PTR_ADDI S12, S12, 0x04 + PTR_ADDI S13, S13, 0x04 + PTR_ADDI S14, S14, 0x04 + PTR_ADDI S15, S15, 0x04 + PTR_ADDI S16, S16, 0x04 + PTR_ADDI TD, TD, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N16 +.L_N15: + andi J, N, 0x0f + beq ZERO, J, .L_N0 + + andi J, N, 0x08 + beq ZERO, J, .L_N7 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_N8_M7 +.align 5 +.L_N8_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M8 +.L_N8_M7: + andi I, M, 0x07 + beq I, ZERO, .L_N7 +.align 5 +.L_N8_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M1 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..5c173568b --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -0,0 +1,298 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 +#define D10 $xr17 +#define D12 $xr18 +#define D14 $xr19 + +// Loops outline +//.L_N8: <---------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0:-------------- +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 17, 20 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x03 + beq J, ZERO, .L_N7 +.align 5 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N8 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 17, 20 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..d9789bdcd --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -0,0 +1,526 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N16: | +//| .L_N15: | +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N16: +//| .L_M4_N15: +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N16: +//| .L_M2_N15: +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N16: +//| .L_M1_N15: +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T0 + PTR_MUL P3, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_SLLI P3, P3, 0x02 + PTR_ADD P2, DST, P2 + PTR_ADD P3, DST, P3 + + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P4, M, T0 + PTR_MUL P5, M, T1 + PTR_SLLI P4, P4, 0x02 + PTR_SLLI P5, P5, 0x02 + PTR_ADD P4, DST, P4 + PTR_ADD P5, DST, P5 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x06 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x200 + + PTR_SRAI I, N, 0x04 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N15 +.L_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI S5, S5, 0x40 + PTR_ADDI S6, S6, 0x40 + PTR_ADDI S7, S7, 0x40 + PTR_ADDI S8, S8, 0x40 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N16 +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ + U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI P2, P2, 0x100 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ + $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P3, P3, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ + $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P4, P4, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ + $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P5, P5, 0x20 +.L_N0: + blt ZERO, J, .L_M8 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M4_N15 +.align 5 +.L_M4_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N16 +.L_M4_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M4_N7 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI P2, P2, 0x80 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P3, P3, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P4, P4, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P5, P5, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M2_N15 +.align 5 +.L_M2_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N16 +.L_M2_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M2_N7 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI P2, P2, 0x40 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P3, P3, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P4, P4, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P5, P5, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M1_N15 +.align 5 +.L_M1_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N16 +.L_M1_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M1_N7 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P2, 0x00 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI P2, P2, 0x20 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P3, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P3, P3, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P4, P4, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P5, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P5, P5, 0x04 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..725a47a60 --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -0,0 +1,406 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define TL $r7 +#undef ZERO +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 23, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_ADD P2, DST, P2 + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P3, M, T0 + PTR_MUL P4, M, T1 + PTR_SLLI P3, P3, 0x02 + PTR_SLLI P4, P4, 0x02 + PTR_ADD P3, DST, P3 + PTR_ADD P4, DST, P4 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x05 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x03 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ + U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N8 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ + $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P2, P2, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ + $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P3, P3, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ + $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P4, P4, 0x20 +.L_N0: + blt ZERO, J, .L_M8 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M4_N7 +.align 5 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N8 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P2, P2, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P3, P3, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P4, P4, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M2_N7 +.align 5 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N8 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P2, P2, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P3, P3, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P4, P4, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x20 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M1_N7 +.align 5 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P1, 0x00 + + PTR_ADDI S1, S1, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N8 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P2, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P2, P2, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P3, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P3, P3, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P4, P4, 0x04 +.L_M0: + pop_if_used 23, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/param.h b/param.h index 547463b2f..03bf3624f 100644 --- a/param.h +++ b/param.h @@ -2848,34 +2848,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NO_LASX) #define DGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 2 #else #define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #endif -#define SGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 -#define SGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_R 12288 +#define SGEMM_DEFAULT_R 1024 #define DGEMM_DEFAULT_R 858 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 152 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 From e9f1b2d26f8c68c2bd1f108565645d72f55b7180 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 16:45:56 +0200 Subject: [PATCH 276/718] Expand the SVE compatibility check for the NVIDIA HPC compiler --- c_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/c_check b/c_check index 7ee183163..4d12c1674 100755 --- a/c_check +++ b/c_check @@ -283,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then no_sve=0 { $compiler_name $flags $args >/dev/null 2>&1 + } || { + args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" + $compiler_name $flags $args >/dev/null 2>&1 } || { no_sve=1 } From 8794544b4322d478608fbd742200b2d6aea12294 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 16:47:32 +0200 Subject: [PATCH 277/718] Add support for compiling the Neoverse SVE kernels with the NVIDIA HPC compiler --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 795f25eec..4bcb571a4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -77,6 +77,10 @@ else ifeq ($(TARGET_CORE), ZEN) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) +else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) + ifeq ($(C_COMPILER), PGI) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif From 49689fbef7b929f0382322b0d21217837ae9b375 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 17:11:04 +0200 Subject: [PATCH 278/718] Add support for compiling SVE kernels with the NVIDIA HPC compiler --- cmake/cc.cmake | 32 ++++++++++++++++++++++---------- cmake/system.cmake | 12 ++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index aeaa76710..7b4ef8947 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -180,22 +180,30 @@ endif () if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") - endif() + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () endif () endif () if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() endif() endif () endif () @@ -213,7 +221,11 @@ endif () if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () endif () endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index 414193ec8..bc87f7b44 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -282,23 +282,35 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL NEOVERSEV1) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + else () execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") else () message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") endif() + endif() endif() if (${TARGET} STREQUAL NEOVERSEN2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") endif() + endif() endif() if (${TARGET} STREQUAL ARMV8SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve") + else () set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") + endif() endif() endif() From 2c3034ff7f6e38dbb8c3fc7a59fbff9a58486d35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 17:22:51 +0200 Subject: [PATCH 279/718] Disable the C/ZASUM AVX512 microkernels when compiling with LLVM17 as well --- kernel/x86_64/casum_microk_skylakex-2.c | 2 +- kernel/x86_64/zasum_microk_skylakex-2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index ac1dc6fa1..d261962de 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -2,7 +2,7 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index d4f35db63..dddf03fe2 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -2,7 +2,7 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) From 7f7d3896dd4012eb051d41b9ca47df6219e8a769 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 18:07:47 +0200 Subject: [PATCH 280/718] Fix missing type declaration for main --- lapack-netlib/INSTALL/lsametst.c | 2 +- lapack-netlib/INSTALL/secondtst.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c index 4b46115fc..631733841 100644 --- a/lapack-netlib/INSTALL/lsametst.c +++ b/lapack-netlib/INSTALL/lsametst.c @@ -426,7 +426,7 @@ static integer c__3 = 3; /* December 2016 */ /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 *** Error: LSAME( \002,a1,\002, \002," diff --git a/lapack-netlib/INSTALL/secondtst.c b/lapack-netlib/INSTALL/secondtst.c index 694679bb5..03e7814e9 100644 --- a/lapack-netlib/INSTALL/secondtst.c +++ b/lapack-netlib/INSTALL/secondtst.c @@ -422,7 +422,7 @@ static integer c__1000 = 1000; /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 Time for \002,g10.3,\002 SAXPY ops = " From 7a6203ffa15d962bcfde5c23f0f6ff03c1a7b60f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Aug 2023 18:25:51 +0200 Subject: [PATCH 281/718] restore default Neoverse SVE build instructions for non-NVIDIA compilers --- kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 4bcb571a4..1e0a0074f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,6 +80,8 @@ else ifeq ($(TARGET_CORE), LOONGSON3R4) else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) ifeq ($(C_COMPILER), PGI) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics + else + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From fc8894dd98f734d48c55e155dbb692ce398c43f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Aug 2023 00:30:17 +0200 Subject: [PATCH 282/718] Workaround miscompilation by NVIDIA nvc --- kernel/arm64/zdot_thunderx2t99.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 728f97fb3..6f65e5cfd 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#if (NVCOMPVERS < 2309) +#pragma opt 1 +#endif +#endif #include "common.h" From 22a402bc2c09c64e15c72af3da90d116a015892d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Aug 2023 01:58:08 +0200 Subject: [PATCH 283/718] clarify the comment on the out-of-bounds check from #723 --- lapack/getf2/getf2_k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c index 80c66dd7a..5795797d3 100644 --- a/lapack/getf2/getf2_k.c +++ b/lapack/getf2/getf2_k.c @@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); jp = j + IAMAX_K(m - j, b + j, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp); From 3b6050ac0428893b3698e33f8348023f5c1bd09c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Aug 2023 02:00:00 +0200 Subject: [PATCH 284/718] clarify the comment on the out-of-bounds check from #723 --- lapack/getf2/zgetf2_k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c index e3d53c96f..6a2137b3e 100644 --- a/lapack/getf2/zgetf2_k.c +++ b/lapack/getf2/zgetf2_k.c @@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); jp = j + IAMAX_K(m - j, b + j * 2, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; From 394a1fd1bfacb7b968b0cacd877e2919a97d58e3 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 31 Aug 2023 15:44:22 +0800 Subject: [PATCH 285/718] LoongArch64: Compatible with early internal toolchain __loongarch_grlen and __loongarch_frlen were introduced in gcc version 8.3.0 (Loongnix 8.3.0-6.lnd.vec.31) internally within Loongson to standardize the general and floating-point register widths. However, previous versions did not have them, requiring additional checks to be added. --- kernel/loongarch64/dgemv_n_8_lasx.S | 12 +++++- kernel/loongarch64/dgemv_t_8_lasx.S | 23 ++++++++--- kernel/loongarch64/loongarch64_asm.S | 27 ++++++++++++- kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 43 ++++++++++++++++----- 4 files changed, 86 insertions(+), 19 deletions(-) diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S index 940d27569..c6523f9ab 100644 --- a/kernel/loongarch64/dgemv_n_8_lasx.S +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -386,9 +386,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA #endif PTR_ALSL X, INC_X, X, 3 bnez J, .L_\XW\()_N_L8 @@ -435,6 +438,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_SUB K_LDA, K_LDA, M8 #if __loongarch_grlen == 64 GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA #else GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA #endif @@ -518,9 +523,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA #endif la.local T0, .L_GAP_TABLE PTR_ALSL I, I, T0, 1 diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S index be90cb1af..7f57c1d88 100644 --- a/kernel/loongarch64/dgemv_t_8_lasx.S +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -263,9 +263,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -#else +#elif __loongarch_grlen == 32 GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 #endif GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 @@ -292,9 +295,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA #endif fst.d $f11, Y, 0x00 fstx.d $f12, Y, INC_Y @@ -353,8 +359,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA #endif fst.d $f11, Y, 0x00 fstx.d $f12, Y, INC_Y @@ -405,8 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA #endif fst.d $f11, Y, 0x00 fstx.d $f12, Y, INC_Y @@ -446,9 +456,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -#else +#elif __loongarch_grlen == 32 GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA #endif la.local T0, .L_GAP_TABLE PTR_ALSL I, I, T0, 1 diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index 89243c620..694dcdaa9 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_SRAI srai.d #define PTR_MUL mul.d #define PTR_ALSL alsl.d -#else +#elif __loongarch_grlen == 32 #define LA_REG int32_t #define REG_SIZE 4 #define REG_LOG 2 @@ -53,6 +53,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PTR_SRAI srai.w #define PTR_MUL mul.w #define PTR_ALSL alsl.w +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d #endif #if __loongarch_frlen == 64 @@ -60,11 +76,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FREG_LOG 3 #define PTR_FLD fld.d #define PTR_FST fst.d -#else +#elif __loongarch_frlen == 32 #define FREG_SIZE 4 #define FREG_LOG 2 #define PTR_FLD fld.s #define PTR_FST fst.s +#else +// If neither of the above two conditions is supported, it means this is an early +// internal toolchain. To ensure maximum compatibility, the following approach is taken: +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d #endif // The max registers available to the user which diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S index 254dbe052..bd609394e 100644 --- a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -335,9 +335,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 #endif .endm @@ -445,9 +448,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride #endif .endm @@ -505,8 +511,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. D6, C3, 0x00, D7, C3, 0x20 #if __loongarch_grlen == 64 GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 #endif .endm @@ -585,8 +593,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif #if __loongarch_grlen == 64 GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride #endif .endm @@ -631,8 +641,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. D2, C1, 0x00, D3, C1, 0x20 #if __loongarch_grlen == 64 GADDI , d, C0, C0, 0x40, C1, C1, 0x40 -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 #endif .endm @@ -703,8 +715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif #if __loongarch_grlen == 64 GADDI , d, C0, C0, \stride, C1, C1, \stride -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, \stride, C1, C1, \stride +#else + GADDI , d, C0, C0, \stride, C1, C1, \stride #endif .endm @@ -741,8 +755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GST xv, , D0, C0, 0x00, D1, C0, 0x20 #if __loongarch_grlen == 64 GADDI , d, C0, C0, 0x40 -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, 0x40 +#else + GADDI , d, C0, C0, 0x40 #endif .endm @@ -813,8 +829,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif #if __loongarch_grlen == 64 GADDI , d, C0, C0, \stride -#else +#elif __loongarch_grlen == 32 GADDI , w, C0, C0, \stride +#else + GADDI , d, C0, C0, \stride #endif .endm @@ -838,9 +856,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if __loongarch_grlen == 64 GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ C6, C5, T0, C7, C6, T0 -#else +#elif __loongarch_grlen == 32 GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ C6, C5, T0, C7, C6, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 #endif #if defined(TRMMKERNEL) && defined(LEFT) move OFF, OFFSET @@ -1222,8 +1243,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PTR_SLLI T0, LDC, 2 #if __loongarch_grlen == 64 GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 -#else +#elif __loongarch_grlen == 32 GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 #endif #if defined(TRMMKERNEL) && defined(LEFT) From a2a184572ce12d5e361a974328281d5243323de8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Aug 2023 23:42:12 +0200 Subject: [PATCH 286/718] update zrotg --- interface/zrotg.c | 271 ++++++++++++++++++++++++---------------------- 1 file changed, 143 insertions(+), 128 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index dd765f05f..af6f85c1c 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -18,20 +18,26 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { #ifdef DOUBLE long double safmin = DBL_MIN; + long double rtmin = sqrt(DBL_MIN/DBL_EPSILON); #else long double safmin = FLT_MIN; + long double rtmin = sqrt(FLT_MIN/FLT_EPSILON); #endif -#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) - long double da_r = *(DA + 0); - long double da_i = *(DA + 1); - long double db_r = *(DB + 0); - long double db_i = *(DB + 1); - long double r; + FLOAT da_r = *(DA+0); + FLOAT da_i = *(DA+1); + FLOAT db_r = *(DB+0); + FLOAT db_i = *(DB+1); + //long double r; + FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT)); + FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT)); + long double d; - long double ada = fabsl(da_r) + fabsl(da_i); - long double adb = sqrt(db_r * db_r + db_i * db_i); + FLOAT ada = da_r * da_r + da_i * da_i; + FLOAT adb = db_r * db_r + db_i * db_i; + FLOAT adart = sqrt( da_r * da_r + da_i * da_i); + FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); PRINT_DEBUG_NAME; @@ -39,128 +45,137 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FUNCTION_PROFILE_START(); - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; + if (db_r == ZERO && db_i == ZERO) { + *C = ONE; + *(S + 0) = ZERO; *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - long double alpha_r, alpha_i; - long double safmax = 1./safmin; - long double sigma; - long double maxab = MAX(ada,adb); - long double scale = MIN(MAX(safmin,maxab), safmax); - - - long double aa_r = da_r / scale; - long double aa_i = da_i / scale; - long double bb_r = db_r / scale; - long double bb_i = db_i / scale; - - if (ada > adb) - sigma = copysign(1.,da_r); - else - sigma = copysign(1.,db_r); - - r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); - - - alpha_r = da_r / ada; - alpha_i = da_i / ada; - - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; + return; } -#else - FLOAT da_r = *(DA + 0); - FLOAT da_i = *(DA + 1); - FLOAT db_r = *(DB + 0); - FLOAT db_i = *(DB + 1); - FLOAT r; - - FLOAT ada = fabs(da_r) + fabs(da_i); - FLOAT adb = fabs(db_r) + fabs(db_i); - - PRINT_DEBUG_NAME; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - - if (ada == ZERO) { - *C = ZERO; - *(S + 0) = ONE; - *(S + 1) = ZERO; - *(DA + 0) = db_r; - *(DA + 1) = db_i; - } else { - long double safmax = 1./safmin; - FLOAT scale, sigma; - FLOAT aa_r, aa_i, bb_r, bb_i; - FLOAT alpha_r, alpha_i; - - aa_r = fabs(da_r); - aa_i = fabs(da_i); - - if (aa_i > aa_r) { - aa_r = fabs(da_i); - aa_i = fabs(da_r); - } - - if (aa_r == ZERO) { - ada = 0.; - } else { - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); - } - - bb_r = fabs(db_r); - bb_i = fabs(db_i); - - if (bb_i > bb_r) { - bb_r = fabs(bb_i); - bb_i = fabs(bb_r); - } - - if (bb_r == ZERO) { - adb = 0.; - } else { - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - } - FLOAT maxab = MAX(ada,adb); - scale = MIN(MAX(safmin,maxab), safmax); - - aa_r = da_r / scale; - aa_i = da_i / scale; - bb_r = db_r / scale; - bb_i = db_i / scale; - if (ada > adb) - sigma = copysign(1.,da_r); - else - sigma = copysign(1.,db_r); - - r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); - - alpha_r = da_r / ada; - alpha_i = da_i / ada; - - *(C + 0) = ada / r; - *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; - *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; - *(DA + 0) = alpha_r * r; - *(DA + 1) = alpha_i * r; - } + long double safmax = 1./safmin; +#if defined DOUBLE + long double rtmax = safmax /DBL_EPSILON; +#else + long double rtmax = safmax /FLT_EPSILON; #endif - - FUNCTION_PROFILE_END(4, 4, 4); - - IDEBUG_END; - - return; + *(S1 + 0) = *(DB + 0); + *(S1 + 1) = *(DB + 1) *-1; + if (da_r == ZERO && da_i == ZERO) { + *C = ZERO; + if (db_r == ZERO) { + (*DA) = fabsl(db_i); + *S = *S1 /da_r; + *(S+1) = *(S1+1) /da_r; + return; + } else if ( db_i == ZERO) { + *DA = fabsl(db_r); + *S = *S1 /da_r; + *(S+1) = *(S1+1) /da_r; + return; + } else { + long double g1 = MAX( fabsl(db_r), fabsl(db_i)); + rtmax =sqrt(safmax/2.); + if (g1 > rtmin && g1 < rtmax) { // unscaled + d = sqrt(adb); + *S = *S1 /d; + *(S+1) = *(S1+1) /d; + *DA = d ; + *(DA+1) = ZERO; + return; + } else { // scaled algorithm + long double u = MIN ( safmax, MAX ( safmin, g1)); + FLOAT gs_r = db_r/u; + FLOAT gs_i = db_i/u; + d = sqrt ( gs_r*gs_r + gs_i*gs_i); + *S = gs_r / d; + *(S + 1) = (gs_i * -1) / d; + *DA = d * u; + *(DA+1) = ZERO; + return; + } + } + } else { + FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i)); + FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i)); + rtmax = sqrt(safmax / 4.); + if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled + long double h = ada + adb; + double adahsq = sqrt(ada * h); + if (ada >= h *safmin) { + *C = sqrt(ada/h); + *R = *DA / *C; + *(R+1) = *(DA+1) / *(C+1); + rtmax *= 2.; + if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow + *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); + *(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq); + } else { + *S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h); + *(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h); + } + } else { + *C = ada / adahsq; + if (*C >= safmin) + *R = *DA / *C; + else + *R = *DA * (h / adahsq); + *S = *S1 * ada / adahsq; + *(S+1) = *(S1+1) * ada / adahsq; + } + *DA=*R; + *(DA+1)=*(R+1); + return; + } else { // scaled + FLOAT fs_r, fs_i, gs_r, gs_i; + long double v,w,f2,g2,h; + long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1))); + gs_r = db_r/u; + gs_i = db_i/u; + g2 = sqrt ( gs_r*gs_r + gs_i*gs_i); + if (f1 /u < rtmin) { + v = MIN (safmax, MAX (safmin, f1)); + w = v / u; + fs_r = *DA/ v; + fs_i = *(DA+1) / v; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 * w * w + g2; + } else { // use same scaling for both + w = 1.; + fs_r = *DA/ u; + fs_i = *(DA+1) / u; + f2 = sqrt ( fs_r*fs_r + fs_i*fs_i); + h = f2 + g2; + } + if ( f2 >= h * safmin) { + *C = sqrt ( f2 / h ); + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + rtmax *= 2; + if ( f2 > rtmin && h < rtmax) { + *S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h)); + *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h)); + } else { + *S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h); + *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h); + } + } else { // intermediates might overflow + d = sqrt ( f2 * h); + *C = f2 /d; + if (*C >= safmin) { + *DA = fs_r / *C; + *(DA+1) = fs_i / *C; + } else { + *DA = fs_r * (h / d); + *(DA+1) = fs_i / (h / d); + } + *S = gs_r * (fs_r /d) - gs_i * (fs_i / d); + *(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d); + } + *C *= w; + *DA *= u; + *(DA+1) *= u; + return; + } + } } + \ No newline at end of file From 5d7304106888d6fe9411f8e7013207708ae7277f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Sep 2023 19:05:53 +0200 Subject: [PATCH 287/718] Update Changelog for 0.3.24 --- Changelog.txt | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index aa445ae82..3937ef08c 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,104 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.24 + 03-Sep-2023 + +general: + - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation + and others, the previous discrepancy appears to have dated back to GotoBLAS) + - fixed the implementation of ?GEMMT that was added in 0.3.23 + - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds + - fixed application of SYMBOLSUFFIX in CMAKE builds + - fixed missing SSYCONVF function in the shared library + - fixed parallel build logic used with gmake + - added support for compilation with LLVM17, in particular its new Fortran compiler + - added support for CMAKE builds using the NVIDIA HPC compiler + - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler + - fixed cross-build detection and management in c_check + - disabled building of the tests with CMAKE when ONLY_CBLAS is defined + - fixed several issues with the handling of runtime limits on the number of OPENMP threads + - corrected the error code returned by SGEADD/DGEADD when LDA is too small + - corrected the error code returned by IMATCOPY when LDB is too small + - updated ?NRM2 to support negative increment values (as introduced in release 3.10 + of the reference BLAS) + - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location + - fixed a potential overwrite of unrelated memory during thread initialisation on startup + - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK + - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22 + - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE + - applied additions and corrections from the development branch of Reference-LAPACK: + - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885) + - fixed workspace query results in LAPACK ?SYTRF/?TRECV3 (from Reference-LAPACK PR 883) + - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878) + - fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876) + - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839) + - corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867) + - removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860) + - updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852) + - fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855) + - fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849) + - added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736) + - fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854) + - applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847) + - removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832) + - fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836) + - added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837) + - updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831) + - improved algorithm description in ?GELSY (Reference-LAPACK PR 833) + - fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830) + - fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768) + - added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827) + - added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795) + - fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820) + - adopt refactored ?GEBAL implementation (Reference-LAPACK PR 808) + +x86_64: + - added cpu model autodetection for Intel Alder Lake N + - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel + - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer + - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG + - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH + - fixed feature-based cputype fallback in DYNAMIC_ARCH + - added support for building the AVX512 kernels with the NVIDIA HPC compiler + - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case + - fixed a potential use of uninitialized variables in ZTRSM + +ARM64: + - added cpu model autodetection for Apple M2 + - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register) + - added support for building the SVE kernels with the NVIDIA HPC compiler + - added support for building the SVE kernels with the Apple Clang compiler + - fixed compiler option handling for building the SVE kernels with LLVM + - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse + - activated SVE SGEMM and DGEMM kernels for Neoverse V1 + - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1 + - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH + - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or + container restrictions into account + - fixed a potential use of uninitialized variables in ZTRSM + - fix a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds + +LOONGARCH64: + - added ABI detection + - added support for cpu affinity handling + - fixed compilation with early versions of the Loongson toolchain + - added an optimized SGEMM kernel for 3A5000 + - added optimized DGEMV kernels for 3A5000 + - improved the performance of the DGEMM kernel for 3A5000 + +MIPS64: + - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target + +POWER: + - fixed compiler warnings in the POWER10 SBGEMM kernel + +RISCV: + - fixed application of the INTERFACE64 option when building with CMAKE + - fix a potential misdetection of RISCV hardware as 32bit in CMAKE builds + - fixed IDAMAX and DOT kernels for C910V + - fixed corner cases in the ROT and SWAP kernels for C910V + - fixed compilation of the C910V target with recent vendor compilers + ==================================================================== Version 0.3.23 01-Apr-2023 From 3c49711f1e6a3ab1ddd6f3147806e53358541247 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Sep 2023 22:57:22 +0200 Subject: [PATCH 288/718] Update version to 0.3.24 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86758d8b6..444baa114 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 23.dev) +set(OpenBLAS_PATCH_VERSION 24) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 9f815cf1bf16b4e64d4aee681b33558fc090b62a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Sep 2023 22:58:32 +0200 Subject: [PATCH 289/718] Update version to 0.3.24 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index e210e49e8..80060a0fc 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.23.dev +VERSION = 0.3.24 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From f2cf9293744c4a17e04cc1690a419ef641fa4c21 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 31 Aug 2023 16:59:37 +0800 Subject: [PATCH 290/718] LoongArch64: Add sgemv kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/sgemv_n_8_lasx.S | 463 ++++++++++++++++++++++++++ kernel/loongarch64/sgemv_t_8_lasx.S | 405 ++++++++++++++++++++++ 3 files changed, 871 insertions(+) create mode 100644 kernel/loongarch64/sgemv_n_8_lasx.S create mode 100644 kernel/loongarch64/sgemv_t_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 67d1fd11c..c23c2fac5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -21,6 +21,9 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMVNKERNEL = sgemv_n_8_lasx.S +SGEMVTKERNEL = sgemv_t_8_lasx.S endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S new file mode 100644 index 000000000..da172ca50 --- /dev/null +++ b/kernel/loongarch64/sgemv_n_8_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/30 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M4 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define A0 $xr11 +#define A1 $xr12 +#define A2 $xr13 +#define A3 $xr14 +#define A4 $xr15 +#define A5 $xr16 +#define A6 $xr17 +#define A7 $xr18 + +#define X0_F $f2 +#define X1_F $f3 +#define X2_F $f4 +#define X3_F $f5 +#define X4_F $f6 +#define X5_F $f7 +#define X6_F $f8 +#define X7_F $f9 +#define Y0_F $f10 +#define A0_F $f11 +#define A1_F $f12 +#define A2_F $f13 +#define A3_F $f14 +#define A4_F $f15 +#define A5_F $f16 +#define A6_F $f17 +#define A7_F $f18 + +.macro SLOAD_X_8 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \ + X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro SLOAD_X_8_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X7, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro SLOAD_X_4 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro SLOAD_X_4_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.w X3, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro SLOAD_X_2 + GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro SLOAD_X_2_GAP + xvldrepl.w X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.w X1, T0, 0x00 + GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro SLOAD_X_1 + GLDREPL xv, w, X0, X, 0x00 + GMUL xvf, s, X0, X0, VALPHA +.endm + +.macro SLOAD_Y_8 + GLD xv, , Y0, Y, 0 +.endm + +.macro SLOAD_Y_8_GAP + fld.s Y0_F, Y, 0 + fldx.s A0_F, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.s A1_F, T0, 0 + fldx.s A2_F, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.s A3_F, T0, 0 + fldx.s A4_F, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.s A5_F, T0, 0 + fldx.s A6_F, T0, INC_Y + GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \ + Y0, A4, 5, Y0, A5, 6, Y0, A6, 7 +.endm + +.macro SLOAD_Y_1 + GLD f, s, Y0_F, Y, 0 +.endm + +.macro SGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0, \ + A4, PA4, 0, A5, PA5, 0, \ + A6, PA6, 0, A7, PA7, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ + Y0, A2, X2, Y0, Y0, A3, X3, Y0, \ + Y0, A4, X4, Y0, Y0, A5, X5, Y0, \ + Y0, A6, X6, Y0, Y0, A7, X7, Y0 +.endm + +.macro SGEMV_N_1x8 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0, \ + A2_F, PA2, 0, A3_F, PA3, 0, \ + A4_F, PA4, 0, A5_F, PA5, 0, \ + A6_F, PA6, 0, A7_F, PA7, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ + Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \ + Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \ + Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F +.endm + +.macro SGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ + Y0, A2, X2, Y0, Y0, A3, X3, Y0 +.endm + +.macro SGEMV_N_1x4 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0, \ + A2_F, PA2, 0, A3_F, PA3, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ + Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F +.endm + +.macro SGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0 + GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0 +.endm + +.macro SGEMV_N_1x2 + GLD_INC f, s, 0x04, \ + A0_F, PA0, 0, A1_F, PA1, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F +.endm + +.macro SGEMV_N_1x1 + GLD_INC f, s, 0x04, A0_F, PA0, 0 + GMADD f, s, Y0_F, A0_F, X0_F, Y0_F +.endm + +.macro SSTORE_Y_8 + GST xv, , Y0, Y, 0 +.endm + +.macro SSTORE_Y_8_GAP + xvstelm.w Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.w Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 4 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 5 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 6 + PTR_ADD T0, T0, INC_Y + xvstelm.w Y0, T0, 0, 7 +.endm + +.macro SSTORE_Y_1 + GST f, s, Y0_F, Y, 0 +.endm + +.macro SGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M4 +.L_\XW\()_N_L8: + SLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x8 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x8 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + SLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x4 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x4 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M4 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + SLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + SLOAD_\Y_8 + SGEMV_N_8x2 + SSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x2 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M4 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + SLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + SLOAD_\Y_1 + SGEMV_N_1x1 + SSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 19 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 + xvreplve0.w VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + SGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + SGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + SGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + SGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 19 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S new file mode 100644 index 000000000..dde3f4a30 --- /dev/null +++ b/kernel/loongarch64/sgemv_t_8_lasx.S @@ -0,0 +1,405 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/30 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M4 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define A0 $xr2 +#define A1 $xr3 +#define A2 $xr4 +#define A3 $xr5 +#define A4 $xr6 +#define A5 $xr7 +#define A6 $xr8 +#define A7 $xr9 +#define TP0 $xr10 +#define TP1 $xr11 +#define TP2 $xr12 +#define TP3 $xr13 +#define TP4 $xr14 +#define TP5 $xr15 +#define TP6 $xr16 +#define TP7 $xr17 +#define Y0 $xr2 +#define Y1 $xr3 +#define Y2 $xr4 +#define Y3 $xr5 +#define Y4 $xr6 +#define Y5 $xr7 +#define Y6 $xr8 +#define Y7 $xr9 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro SLOAD_X8 + GLD xv, , X0, X, 0x00 +.endm + +.macro SLOAD_X8_GAP + fld.s $f1, X, 0x00 + fldx.s $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.s $f3, T0, 0x00 + fldx.s $f4, T0, INC_X + GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.s $f2, T0, 0x00 + fldx.s $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.s $f4, T0, 0x00 + fldx.s $f5, T0, INC_X + GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7 +.endm + +.macro SGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0, \ + A4, PA4, 0, A5, PA5, 0, \ + A6, PA6, 0, A7, PA7, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ + TP2, A2, X0, TP2, TP3, A3, X0, TP3, \ + TP4, A4, X0, TP4, TP5, A5, X0, TP5, \ + TP6, A6, X0, TP6, TP7, A7, X0, TP7 +.endm + +.macro SGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0, \ + A2, PA2, 0, A3, PA3, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ + TP2, A2, X0, TP2, TP3, A3, X0, TP3 +.endm + +.macro SGEMV_T_2x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA1, 0 + GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 +.endm + +.macro SGEMV_T XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M4 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + SLOAD_\X8 + SGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.s $f1, X, 0x00 + fld.s $f10, PA0, 0x00 + fld.s $f11, PA1, 0x00 + fld.s $f12, PA2, 0x00 + fld.s $f13, PA3, 0x00 + fld.s $f14, PA4, 0x00 + fld.s $f15, PA5, 0x00 + fld.s $f16, PA6, 0x00 + fld.s $f17, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#else + GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ + PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 +#endif + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \ + $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.s $f12, PY0, 0x00 + fldx.s $f13, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.s $f14, PY1, 0x00 + fldx.s $f15, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.s $f16, PY2, 0x00 + fldx.s $f17, PY2, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \ + $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + fst.s $f12, PY0, 0x00 + fstx.s $f13, PY0, INC_Y + fst.s $f14, PY1, 0x00 + fstx.s $f15, PY1, INC_Y + fst.s $f16, PY2, 0x00 + fstx.s $f17, PY2, INC_Y + + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + SLOAD_\X8 + SGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 7 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.s $f1, X, 0x00 + GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00 + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.s $f12, PY0, 0x00 + fldx.s $f13, PY0, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M4 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + fst.s $f12, PY0, 0x00 + fstx.s $f13, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + SLOAD_\X8 + SGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + // Accumulated + GACC xvf, s, Y0, TP0, Y1, TP1 + andi I, M, 7 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.s $f1, X, 0x00 + GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00 + GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.s $f10, Y, 0x00 + fldx.s $f11, Y, INC_Y + + GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M4 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.s $f10, Y, 0x00 + fstx.s $f11, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.s $f2, PA0, 0x00 + fld.s $f1, X, 0x00 + fmadd.s $f10, $f2, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x04 + bnez I, .L_\XW\()_N_1_M_L1 + + fld.s $f2, Y, 0x00 + fmadd.s $f2, ALPHA, $f10, $f2 + fst.s $f2, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 18 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 + xvreplve0.w VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + SGEMV_T GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + SGEMV_T GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 18 + jirl $r0, $r1, 0x0 + EPILOGUE From 4867cf5dd78c47fff3cc18a49267575903651a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Sep 2023 08:39:40 +0200 Subject: [PATCH 291/718] Update version to 0.3.24.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 444baa114..35077f3c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 24) +set(OpenBLAS_PATCH_VERSION 24.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From c3f2a3c0ca8d4a3700f81f176dd3f1bc08cff3dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Sep 2023 08:40:25 +0200 Subject: [PATCH 292/718] Update version to 0.3.24.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 80060a0fc..707924904 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.24 +VERSION = 0.3.24.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 09911f077e62818109b3a393ca4c2fdf9a2112f7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Sep 2023 16:33:40 +0200 Subject: [PATCH 293/718] Disable SVE targets for DYNAMIC_ARCH when compiling with (homebrew)gcc on macOS/arm64 --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index b3968d739..5a4af9698 100644 --- a/Makefile.system +++ b/Makefile.system @@ -397,6 +397,9 @@ ifeq ($(OSNAME), Darwin) ifndef MACOSX_DEPLOYMENT_TARGET ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 +ifeq ($(C_COMPILER), GCC) +export NO_SVE = 1 +endif else export MACOSX_DEPLOYMENT_TARGET=10.8 endif From 6a611db560089b11d2b786179cf26443171798fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiziano=20M=C3=BCller?= Date: Sun, 10 Sep 2023 08:44:07 +0200 Subject: [PATCH 294/718] memory: show correct number of max threads --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 4fceae754..fb7f36cdc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3012,7 +3012,7 @@ void *blas_memory_alloc(int procpos){ if (memory_overflowed) goto terminate; fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); - fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS); + fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); From fb97cc4d5e81d788ba2c99b016cb3b7628e64229 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Sep 2023 10:46:23 +0200 Subject: [PATCH 295/718] Add la_constants.o to SCLAUX/DZLAUX --- lapack-netlib/SRC/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index c75fd5f49..26314c4df 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" -SCLAUX = \ +SCLAUX = la_constants.o \ sbdsvdx.o sstevx.o sstein.o \ sbdsdc.o \ sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ @@ -106,7 +106,7 @@ SCLAUX = \ endif ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DZLAUX = \ +DZLAUX = la_constants.o\ dcombssq.o \ dbdsvdx.o dstevx.o dstein.o \ dbdsdc.o \ From 7779bb6fb1938dad2961f819cc13cdb4316442da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Sep 2023 20:21:06 +0200 Subject: [PATCH 296/718] Make IWORK array larger to avoid overflow --- lapack-netlib/SRC/dtgex2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dtgex2.f b/lapack-netlib/SRC/dtgex2.f index 00c23a833..1ccd92c8f 100644 --- a/lapack-netlib/SRC/dtgex2.f +++ b/lapack-netlib/SRC/dtgex2.f @@ -254,7 +254,7 @@ $ THRESHA, THRESHB * .. * .. Local Arrays .. - INTEGER IWORK( LDST ) + INTEGER IWORK( LDST + 2 ) DOUBLE PRECISION AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ), $ IRCOP( LDST, LDST ), LI( LDST, LDST ), $ LICOP( LDST, LDST ), S( LDST, LDST ), From 1285b53e398ec9cbab79368f762062f5154ed383 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Sep 2023 20:22:11 +0200 Subject: [PATCH 297/718] Make IWORK array larger to avoid overflow --- lapack-netlib/SRC/stgex2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/stgex2.f b/lapack-netlib/SRC/stgex2.f index d61da2327..885739ab5 100644 --- a/lapack-netlib/SRC/stgex2.f +++ b/lapack-netlib/SRC/stgex2.f @@ -255,7 +255,7 @@ $ THRESHA, THRESHB * .. * .. Local Arrays .. - INTEGER IWORK( LDST ) + INTEGER IWORK( LDST + 2 ) REAL AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ), $ IRCOP( LDST, LDST ), LI( LDST, LDST ), $ LICOP( LDST, LDST ), S( LDST, LDST ), From 7e939fb8312d512c7e3948d7977d0af4ba660371 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Sep 2023 23:33:39 +0200 Subject: [PATCH 298/718] Fix handling of additional buffer structures in case of overflow --- driver/others/memory.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index fb7f36cdc..b27fec431 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define NEW_BUFFERS 512 #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) @@ -2897,7 +2898,7 @@ void *blas_memory_alloc(int procpos){ #endif position ++; - } while (position < 512+NUM_BUFFERS); + } while (position < NEW_BUFFERS + NUM_BUFFERS); } #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -3014,9 +3015,10 @@ void *blas_memory_alloc(int procpos){ fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n"); fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER); memory_overflowed=1; - new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); - newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (i = 0; i < 512; i++) { + MB; + new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct)); + for (i = 0; i < NEW_BUFFERS; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ -3129,12 +3131,12 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { - while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory WMB; - - newmemory[position].used = 0; +if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error; + newmemory[position-NUM_BUFFERS].used = 0; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -3213,7 +3215,7 @@ void blas_shutdown(void){ memory[pos].lock = 0; } if (memory_overflowed) - for (pos = 0; pos < 512; pos ++){ + for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; #if defined(WHEREAMI) && !defined(USE_OPENMP) From 6876ae0c3b77b3fb08185e8619801343e4388868 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz <17718454+angsch@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:10:08 +0200 Subject: [PATCH 299/718] Fix division by zero in zrotg The cases [ c s ] * [ 0 ] = [ |db_i| ] [-s c ] [ i*db_i ] [ 0 ] and [ c s ] * [ 0 ] = [ |db_r| ] [-s c ] [ db_r ] [ 0 ] computed s incorrectly. To flip the entries of vector, s should be conjg(db)/|db| and not conjg(db) / da, where da == 0.0. --- interface/zrotg.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index af6f85c1c..4d2a9d510 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -61,16 +61,16 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { *(S1 + 0) = *(DB + 0); *(S1 + 1) = *(DB + 1) *-1; if (da_r == ZERO && da_i == ZERO) { - *C = ZERO; + *C = ZERO; if (db_r == ZERO) { (*DA) = fabsl(db_i); - *S = *S1 /da_r; - *(S+1) = *(S1+1) /da_r; + *S = *S1 /(*DA); + *(S+1) = *(S1+1) /(*DA); return; } else if ( db_i == ZERO) { *DA = fabsl(db_r); - *S = *S1 /da_r; - *(S+1) = *(S1+1) /da_r; + *S = *S1 /(*DA); + *(S+1) = *(S1+1) /(*DA); return; } else { long double g1 = MAX( fabsl(db_r), fabsl(db_i)); From db3a43c8edeb36ecc9e7cde10b1c06be3f2147fc Mon Sep 17 00:00:00 2001 From: Angelika Schwarz <17718454+angsch@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:42:13 +0200 Subject: [PATCH 300/718] Simplify rotg * The check da != ZERO is no longer necessary since there is a special case ada == ZERO, where ada = |da|. * Add the missing check c != ZERO before the division. Note that with these two changes the long double code follows the float/double version of the code. --- interface/rotg.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/interface/rotg.c b/interface/rotg.c index 8d40d9c53..423ebda21 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -66,13 +66,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ c = da / r; s = db / r; z = ONE; - if (da != ZERO) { - if (ada > adb){ - z = s; - } else { - z = ONE / c; - } - } + if (ada > adb) z = s; + if ((ada <= adb) && (c != ZERO)) z = ONE / c; *C = c; *S = s; From 44e6e5479b87f697b4d4fc92030c162f2451b384 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Sep 2023 23:01:21 +0200 Subject: [PATCH 301/718] Use the C compiler for the C SBGEMM test source --- test/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 46a7b1158..fa054f15b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -326,7 +326,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) From 2390e0bfbc203f5566b0fede523b1caf1c344deb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Sep 2023 23:04:25 +0200 Subject: [PATCH 302/718] Quote the BU (underscore) option as it may not be set --- exports/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index d81735342..7682f851d 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -272,23 +272,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) aix.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objconv.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed From b926e70ebd879bb022d265a3859bfb5481b4d99f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Sep 2023 23:07:32 +0200 Subject: [PATCH 303/718] Fix typo in build rule of "profiled" sbgemm --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index 2ac9663d6..78335357b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1301,7 +1301,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif From bb4718322294bd9f28b0343a643d0986e9046a2b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Sep 2023 10:13:47 +0200 Subject: [PATCH 304/718] Force -qextname for trailing underscore generation when IBM xlf is used with gcc --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 5a4af9698..ae6db40b0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1167,6 +1167,10 @@ endif ifeq ($(F_COMPILER), IBM) CCOMMON_OPT += -DF_INTERFACE_IBM +FEXTRALIB += -lxlf90 +ifeq ($(C_COMPILER), GCC) +FCOMMON_OPT += -qextname +endif # FCOMMON_OPT += -qarch=440 ifdef BINARY64 FCOMMON_OPT += -q64 From 8012afcabbc912e32961924a77858e334ee75356 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Sep 2023 10:15:12 +0200 Subject: [PATCH 305/718] Avoid using some gcc-specific flags with IBM xlf --- Makefile.power | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile.power b/Makefile.power index 28a0bae08..33702c932 100644 --- a/Makefile.power +++ b/Makefile.power @@ -42,15 +42,14 @@ FCOMMON_OPT += -O2 -qrecur -qnosave else FCOMMON_OPT += -O2 -frecursive -fno-fast-math endif -ifeq ($(C_COMPILER), GCC) + +ifeq ($(F_COMPILER), GFORTRAN) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) FCOMMON_OPT += -mcpu=power8 -mtune=power8 else FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif -else -FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else FCOMMON_OPT += -O2 -Mrecursive @@ -84,12 +83,16 @@ CCOMMON_OPT += -DUSE_OPENMP -fopenmp else CCOMMON_OPT += -DUSE_OPENMP -mp endif +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -DUSE_OPENMP +else ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -DUSE_OPENMP -fopenmp else FCOMMON_OPT += -DUSE_OPENMP -mp endif endif +endif # workaround for C->FORTRAN ABI violation in LAPACKE ifeq ($(F_COMPILER), GFORTRAN) From 4de963dc17eb682e774a85a494a28001d6e6aa98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Sep 2023 10:16:37 +0200 Subject: [PATCH 306/718] Enforce trailing underscores on symbols when IBM xlf is combined with gcc --- f_check | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/f_check b/f_check index 526c41dc6..f30231bc4 100755 --- a/f_check +++ b/f_check @@ -155,6 +155,10 @@ else *'IBM XL'*) vendor=IBM openmp='-openmp' + case "$CC" in *gcc*) + bu=_ + ;; + esac ;; *NAG*) vendor=NAG @@ -223,6 +227,10 @@ else *ppuf*|*xlf*) vendor=IBM openmp='-openmp' + case "$CC" in *gcc*) + bu=_ + ;; + esac ;; *open64*) vendor=OPEN64 From 7a96908d0cb0ee3cc5b49390a5ec0ca3a71fefdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Sep 2023 10:18:24 +0200 Subject: [PATCH 307/718] Add -lgomp when IBM xlf is combined with gcc in OPENMP builds --- ctest/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 9e85d23b9..af5b34a36 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -214,6 +214,11 @@ endif ifeq ($(F_COMPILER), NAG) CEXTRALIB = -lgomp endif +ifeq ($(F_COMPILER), IBM) +ifeq ($(C_COMPILER), GCC) +CEXTRALIB += -lgomp +endif +endif endif ifeq ($(BUILD_SINGLE),1) From 2a9981a2442106f67d963ee68cb4ee3b1a7a0334 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Sep 2023 10:19:11 +0200 Subject: [PATCH 308/718] Add -lgomp when IBM xlf is combined with gcc in OPENMP builds --- test/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index fa054f15b..715842b4d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -271,6 +271,11 @@ endif ifeq ($(F_COMPILER), NAG) CEXTRALIB = -lgomp endif +ifeq ($(F_COMPILER), IBM) +ifeq ($(C_COMPILER), GCC) +CEXTRALIB += -lgomp +endif +endif endif ifeq ($(BUILD_SINGLE),1) @@ -326,7 +331,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(CC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) From 4670eb1462b73ffa82699a70bc383bfd41461a0f Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 20 Sep 2023 09:09:35 +0800 Subject: [PATCH 309/718] LoongArch64: Add dtrsm kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- .../loongarch64/dtrsm_kernel_LN_16x4_lasx.S | 1366 +++++++++++ .../loongarch64/dtrsm_kernel_LT_16x4_lasx.S | 959 ++++++++ .../loongarch64/dtrsm_kernel_RN_16x4_lasx.S | 882 +++++++ .../loongarch64/dtrsm_kernel_RT_16x4_lasx.S | 953 ++++++++ kernel/loongarch64/dtrsm_kernel_macro.S | 2147 +++++++++++++++++ 6 files changed, 6312 insertions(+), 5 deletions(-) create mode 100644 kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S create mode 100644 kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S create mode 100644 kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S create mode 100644 kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S create mode 100644 kernel/loongarch64/dtrsm_kernel_macro.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index c23c2fac5..011e8b89e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -24,12 +24,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S -endif -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S +DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S new file mode 100644 index 000000000..3315daccb --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S @@ -0,0 +1,1366 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +// By integrating the dgemm and dsolve processes, the following advantages can be obtained: +// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel) +// 2. Reduce the storage and retrieval of C data +// 3. Vectorization of dsolve +// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size. +// To achieve finer-grained optimization, 15 scenarios have been addressed: +// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1. + +.macro dsolve_16 N +// if N = 4 the data layout of C is as follows: +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// U8 U9 U10 U11 +// U12 U13 U14 U15 +// if N = 2 the dat layout of C is as follows: +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// if N = 1 the dat layout of C is as follows: +// U0 U1 U2 U3 +// The matrix A has dimensions of 16x16, and +// it will be divided into 4 segments for processing. + +#define G12 U3 +#define G13 U7 +#define G14 U11 +#define G15 U15 + GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 + // A + // G12 G13 G14 G15 + // ----------------- + // 204 | D9 + // 220 221 | D8 D7 + // 236 237 238 | D6 D5 D4 + // 252 253 254 255 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 252 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 236 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 220 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 204 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G15, G15, D0 + GNMSUB xvf, d, G14, G15, D1, G14 + xvfmul.d G14, G14, D4 + GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13 + xvfmul.d G13, G13, D7 + GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12 + xvfmul.d G12, G12, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // x x x x ... x x x x + // x x x x ... x x x x + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8 +.elseif \N == 2 + // x x x x ... x x x x + // x x x x ... x x x x + // x x x x ... x x x x + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8 +.elseif \N == 1 + // x x x x + // x x x x + // x x x x + // b12 b13 b14 b15 + GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8 +.endif + // Transpose G15 G14 G13 G12 + GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x ... c12 c13 c14 c15 + // x x x x ... c28 c29 c30 c31 + // x x x x ... c44 c45 c46 c47 + // x x x x ... c60 c61 c62 c63 + GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8 +.elseif \N == 2 + // x x x x ... c12 c13 c14 c15 + // x x x x ... c28 c29 c30 c31 + GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8 +.elseif \N == 1 + // Store C + // x x x x ... c12 c13 c14 c15 + GST xv, , D0, C0, 12 * 8 +.endif + +#define G8 U2 +#define G9 U6 +#define G10 U10 +#define G11 U14 + GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 + // A + // G8 G9 G10 G11 + // ----------------- + // 136 | D9 + // 152 153 | D8 D7 + // 168 169 170 | D6 D5 D4 + // 184 185 186 187 | D3 D2 D1 D0 + // 200 201 202 203 | D15 D14 D13 D12 + // 216 217 218 219 | D11 D10 D9 D8 + // 232 233 234 235 | D7 D6 D5 D4 + // 248 249 250 251 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 248 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 232 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 216 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 200 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \ + G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \ + G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \ + G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8 + PTR_ADDI T0, A0, 184 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 168 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 152 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 136 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G11, G11, D0 + GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8 + xvfmul.d G10, G10, D4 + GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8 + xvfmul.d G9, G9, D7 + GNMSUB xvf, d, G8, G9, D8, G8 + xvfmul.d G8, G8, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // x x x x ... x x x x + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8 +.elseif \N == 2 + // x x x x ... x x x x + // x x x x ... x x x x + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8 +.elseif \N == 1 + // x x x x + // x x x x + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8 +.endif + // Transpose G11 G10 G9 G8 + GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 + // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47 + // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63 + GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8 +.elseif \N == 2 + // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 + GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8 +.elseif \N == 1 + // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 + GST xv, , D0, C0, 8 * 8 +.endif + +#define G4 U1 +#define G5 U5 +#define G6 U9 +#define G7 U13 + GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1 + // A + // G4 G5 G6 G7 + // ------------------ + // 68 | D9 + // 84 85 | D8 D7 + // 100 101 102 | D6 D5 D4 + // 116 117 118 119 | D3 D2 D1 D0 + // 132 133 134 135 | D15 D14 D13 D12 + // 148 149 150 151 | D11 D10 D9 D8 + // 164 165 166 167 | D7 D6 D5 D4 + // 180 181 182 183 | D3 D2 D1 D0 + // 196 197 198 199 | D15 D14 D13 D12 + // 212 213 214 215 | D11 D10 D9 D8 + // 228 229 230 231 | D7 D6 D5 D4 + // 244 245 246 247 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 244 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 228 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 212 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 196 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \ + G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \ + G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \ + G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4 + PTR_ADDI T0, A0, 180 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 164 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 148 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 132 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \ + G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \ + G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \ + G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4 + PTR_ADDI T0, A0, 116 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 100 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 84 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 68 * 8 + GLDREPL xv, d, D9, T0, 0 + xvfmul.d G7, G7, D0 + GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 + xvfmul.d G6, G6, D4 + GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 + xvfmul.d G5, G5, D7 + GNMSUB xvf, d, G4, G5, D8, G4 + xvfmul.d G4, G4, D9 + // Store B +.if \N == 4 + // x x x x ... x x x x + // b16 b17 b18 b19 ... b28 b29 b30 b31 + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 +.elseif \N == 2 + // x x x x ... x x x x + // b8 b9 b10 b11 b12 b13 b14 b15 + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8 +.elseif \N == 1 + // x x x x + // b4 b5 b6 b7 + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8 +.endif + // Transpose G7 G6 G5 G4 + GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 + // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47 + // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63 + GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8 +.elseif \N == 2 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 + GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8 +.elseif \N == 1 + // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + GST xv, , D0, C0, 4 * 8 +.endif + +#define G0 U0 +#define G1 U4 +#define G2 U8 +#define G3 U12 + GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + // ------------------ + // 0 | D9 + // 16 17 | D8 D7 + // 32 33 34 | D6 D5 D4 + // 48 49 50 51 | D3 D2 D1 D0 + // 64 65 66 67 | D15 D14 D13 D12 + // 80 81 82 83 | D11 D10 D9 D8 + // 96 97 98 99 | D7 D6 D5 D4 + // 112 113 114 115 | D3 D2 D1 D0 + // 128 129 130 131 | D15 D14 D13 D12 + // 144 145 146 147 | D11 D10 D9 D8 + // 160 161 162 163 | D7 D6 D5 D4 + // 176 177 178 179 | D3 D2 D1 D0 + // 192 193 194 195 | D15 D14 D13 D12 + // 208 209 210 211 | D11 D10 D9 D8 + // 224 225 226 227 | D7 D6 D5 D4 + // 240 241 242 243 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 240 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 224 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 208 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 192 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \ + G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \ + G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \ + G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0 + PTR_ADDI T0, A0, 176 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 160 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 144 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 128 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \ + G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \ + G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \ + G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0 + PTR_ADDI T0, A0, 112 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 96 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 80 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 64 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ + G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ + G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ + G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 + PTR_ADDI T0, A0, 48 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 32 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 16 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 0 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + // b0 b1 b2 b3 ... b12 b13 b14 b15 + // b16 b17 b18 b19 ... b28 b29 b30 b31 + // b32 b33 b34 b34 ... b44 b45 b46 b47 + // b48 b49 b50 b51 ... b60 b61 b62 b63 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + // b0 b1 b2 b3 b4 b5 b6 b7 + // b8 b9 b10 b11 b12 b13 b14 b15 + // b16 b17 b18 b19 b20 b21 b22 b23 + // b24 b25 b26 b27 b28 b29 b30 b31 + GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8 +.elseif \N == 1 + // b0 b1 b2 b3 + // b4 b5 b6 b7 + // b8 b9 b10 b11 + // b12 b13 b14 b15 + GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8 +.endif + // Transpose C3 C2 C1 C0 + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + // c16 c17 c18 c19 ... c28 c29 c30 c31 + // c32 c33 c34 c34 ... c44 c45 c46 c47 + // c48 c49 c50 c51 ... c60 c61 c62 c63 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + // c16 c17 c18 c19 ... c28 c29 c30 c31 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + // c0 c1 c2 c3 ... c12 c13 c14 c15 + GST xv, , D0, C0, 0 +.endif + +#undef G0 +#undef G1 +#undef G2 +#undef G3 +#undef G4 +#undef G5 +#undef G6 +#undef G7 +#undef G8 +#undef G9 +#undef G10 +#undef G11 +#undef G12 +#undef G13 +#undef G14 +#undef G15 +.endm + +.macro dsolve_8 N +// if N = 4 the data layout of C is as follows: +// U0 U1 +// U2 U3 +// U4 U5 +// U6 U7 +// if N = 2 the dat layout of C is as follows: +// U0 U1 +// U2 U3 +// if N = 1 the dat layout of C is as follows: +// U0 U1 +// The matrix A has dimensions of 8x8, and +// it will be divided into 2 segments for processing. + +#define G4 U1 +#define G5 U3 +#define G6 U5 +#define G7 U7 + // Transpose U7 U5 U3 U1 + GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1 + // A + // G4 G5 G6 G7 + // --------------- + // 36 | D9 + // 44 45 | D8 D7 + // 52 53 54 | D6 D5 D4 + // 60 61 62 63 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 60 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 52 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 44 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 36 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G7, G7, D0 + GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 + xvfmul.d G6, G6, D4 + GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 + xvfmul.d G5, G5, D7 + GNMSUB xvf, d, G4, G5, D8, G4 + xvfmul.d G4, G4, D9 + // Store B +.if \N == 4 + GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 +.elseif \N == 2 + GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8 +.elseif \N == 1 + GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9 + // Store C +.if \N == 4 + GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8 +.elseif \N == 2 + GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8 +.elseif \N == 1 + GST xv, , D4, C0, 4 * 8 +.endif + +#define G0 U0 +#define G1 U2 +#define G2 U4 +#define G3 U6 + // Transpose U6 U4 U2 U0 + GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + //----------------- + // 0 | D9 + // 8 9 | D8 D7 + // 16 17 18 | D6 D5 D4 + // 24 25 26 27 | D3 D2 D1 D0 + // 32 33 34 35 | D15 D14 D13 D12 + // 40 41 42 43 | D11 D10 D9 D8 + // 48 49 50 51 | D7 D6 D5 D4 + // 56 57 58 59 | D3 D2 D1 D0 + PTR_ADDI T0, A0, 56 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 48 * 8 + GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 + PTR_ADDI T0, A0, 40 * 8 + GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 + PTR_ADDI T0, A0, 32 * 8 + GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 + GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ + G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ + G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ + G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 + PTR_ADDI T0, A0, 24 * 8 + GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 + PTR_ADDI T0, A0, 16 * 8 + GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 + PTR_ADDI T0, A0, 8 * 8 + GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 + PTR_ADDI T0, A0, 0 * 8 + GLDREPL xv, d, D9, T0, 0 + + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8 +.elseif \N == 1 + GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + GST xv, , D0, C0, 0 +.endif + +#undef G0 +#undef G1 +#undef G2 +#undef G3 +#undef G4 +#undef G5 +#undef G6 +#undef G7 +.endm + +.macro dsolve_4 N +// if N = 4 the data layout of C is as follows: +// U0 +// U1 +// U2 +// U3 +// if N = 2 the dat layout of C is as follows: +// U0 +// U1 +// if N = 1 the dat layout of C is as follows: +// U0 +// The matrix A has dimensions of 4x4, and +// it will be divided into 1 segments for processing. + +#define G0 U0 +#define G1 U1 +#define G2 U2 +#define G3 U3 + // Transpose U3 U2 U1 U0 + GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1 + // A + // G0 G1 G2 G3 + //------------- + // 0 | D9 + // 4 5 | D8 D7 + // 8 9 10 | D6 D5 D4 + // 12 13 14 15 | D3 D2 D1 D0 + GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \ + D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \ + D8, A0, 4 * 8, D7, A0, 5 * 8, \ + D9, A0, 0 * 8 + xvfmul.d G3, G3, D0 + GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 + xvfmul.d G2, G2, D4 + GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 + xvfmul.d G1, G1, D7 + GNMSUB xvf, d, G0, G1, D8, G0 + xvfmul.d G0, G0, D9 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 +.elseif \N == 2 + GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8 +.elseif \N == 1 + GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8 +.endif + // Transpose + GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 + // Store C +.if \N == 4 + GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 +.elseif \N == 2 + GST xv, , D0, C0, 0, D1, C1, 0 +.elseif \N == 1 + GST xv, , D0, C0, 0 +.endif + +#undef G0 +#undef G1 +#undef G2 +#undef G3 +.endm + +.macro dsolve_2 N +#define G0 U2 +#define G1 U3 + // Transpose + GSBUTTERFLY xv, d, G0, G1, U1, U0 + // A + // G0 G1 + // ------ + // 0 | D2 + // 2 3 | D1 D0 + GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8 + xvfmul.d G1, G1, D0 + GNMSUB xvf, d, G0, G1, D1, G0 + xvfmul.d G0, G0, D2 + // Store B +.if \N == 4 + GST xv, , G0, B0, 0, G1, B0, 4 * 8 +.elseif \N == 2 + GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8 +.elseif \N == 1 + GST f, d, $f2, B0, 0, $f3, B0, 8 +.endif + // Transpose + GSBUTTERFLY xv, d, D0, D1, G1, G0 + // Store C +.if \N == 4 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +.elseif \N == 2 + GST v, , $vr16, C0, 0, $vr17, C1, 0 +.elseif \N == 1 + GST v, , $vr16, C0, 0 +.endif + +#undef G0 +#undef G1 +.endm + +.macro dgemm_dsolve_16x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 4 * 8) + dsolve_16 4 +.endm + +.macro dgemm_dsolve_1x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 +.L_dsolve_1x4: + or A0, T1, T1 + or B0, T2, T2 + GLDREPL xv, d, D0, A0, -1 * 8 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U0, C3, 0x00, 0x03 + // Store B + xvst U0, B0, -32 +.endm + +.macro dgemm_dsolve_2x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 +/********************** solver ******************/ +.L_dsolve_2x4: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 4 * 8) + dsolve_2 4 +.endm + +.macro dgemm_dsolve_4x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 4 * 8) + + dsolve_4 4 +.endm + +.macro dgemm_dsolve_8x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 4 * 8) + + dsolve_8 4 +.endm + +.macro dgemm_dsolve_4x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 2 * 8) + + dsolve_4 2 +.endm + +.macro dgemm_dsolve_2x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 2 * 8) + + dsolve_2 2 +.endm + +.macro dgemm_dsolve_8x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 2 * 8) + + dsolve_8 2 +.endm + +.macro dgemm_dsolve_16x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 2 * 8) + + dsolve_16 2 +.endm + +.macro dgemm_dsolve_2x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + PTR_ADDI A0, T1, -(2 * 2 * 8) + PTR_ADDI B0, T2, -(2 * 1 * 8) + + dsolve_2 1 +.endm + +.macro dgemm_dsolve_4x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + PTR_ADDI A0, T1, -(4 * 4 * 8) + PTR_ADDI B0, T2, -(4 * 1 * 8) + + dsolve_4 1 +.endm + +.macro dgemm_dsolve_8x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + PTR_ADDI A0, T1, -(8 * 8 * 8) + PTR_ADDI B0, T2, -(8 * 1 * 8) + + dsolve_8 1 +.endm + +.macro dgemm_dsolve_16x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + PTR_ADDI A0, T1, -(16 * 8 * 8) + PTR_ADDI A0, A0, -(16 * 8 * 8) + PTR_ADDI B0, T2, -(16 * 1 * 8) + + dsolve_16 1 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + PTR_ADD KK, M, OFFSET + + andi I, M, 15 + beq ZERO, I, .L_M16 + andi I, M, 1 + beqz I, .L_M2 +.L_M1: + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_1x4 + PTR_ADDI KK, KK, -1 +.L_M2: + andi I, M, 2 + beqz I, .L_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_2x4 + PTR_ADDI KK, KK, -2 +.L_M4: + andi I, M, 4 + beqz I, .L_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_4x4 + PTR_ADDI KK, KK, -4 +.L_M8: + andi I, M, 8 + beqz I, .L_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 /* b + 4 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + dgemm_dsolve_8x4 + PTR_ADDI KK, KK, -8 +.L_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_I1: + PTR_SLLI T0, KK, 5 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_I1 +.L_M0: + PTR_SLLI T0, K, 3 + PTR_ALSL B, T0, B, 2 // b += 4 * k; + PTR_ALSL C, LDC, C, 2 // c += 4 * ldc + blt ZERO, J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N3_M16 + andi I, M, 1 + beqz I, .L_N3_M2 +.L_N3_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + GADD , d, C0, CC, ZERO, C1, C0, LDC + // dgemm_dsolve_1x2 + GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 + GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 + GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 +.L_N3_M2: + andi I, M, 2 + beqz I, .L_N3_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_2x2 + PTR_ADDI KK, KK, -2 +.L_N3_M4: + andi I, M, 4 + beqz I, .L_N3_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_4x2 + PTR_ADDI KK, KK, -4 +.L_N3_M8: + andi I, M, 8 + beqz I, .L_N3_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 /* b + 2 * kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + dgemm_dsolve_8x2 + PTR_ADDI KK, KK, -8 +.L_N3_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N3_I1: + PTR_SLLI T0, KK, 4 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N3_I1 +.L_N3_M0: + PTR_SLLI T0, K, 3 + PTR_ALSL B, T0, B, 1 // b += 2 * k; + PTR_ALSL C, LDC, C, 1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + PTR_ADD KK, M, OFFSET + andi I, M, 15 + beq ZERO, I, .L_N1_M16 + andi I, M, 1 + beqz I, .L_N1_M2 +.L_N1_M1: + PTR_ADDI KK, KK, -1 + + PTR_ADDI T0, M, -1 + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ + PTR_ADD CC, T0, C /* c + (m - 1) */ + + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + GADD , d, C0, CC, ZERO + // dgemm_dsolve_1x1 + GLD f, d, $f0, A0, 0, $f1, C0, 0 + GMUL f, d, $f1, $f1, $f0 + GST f, d, $f1, C0, 0, $f1, B0, 0 +.L_N1_M2: + andi I, M, 2 + beqz I, .L_N1_M4 + PTR_SRLI T0, M, 1 + PTR_SLLI T0, T0, 1 + PTR_ADDI T0, T0, -2 + PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ + PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ + PTR_SLLI T1, KK, 4 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_2x1 + PTR_ADDI KK, KK, -2 +.L_N1_M4: + andi I, M, 4 + beqz I, .L_N1_M8 + PTR_SRLI T0, M, 2 + PTR_SLLI T0, T0, 2 + PTR_ADDI T0, T0, -4 + PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ + PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ + PTR_SLLI T1, KK, 5 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_4x1 + PTR_ADDI KK, KK, -4 +.L_N1_M8: + andi I, M, 8 + beqz I, .L_N1_M16 + PTR_SRLI T0, M, 3 + PTR_SLLI T0, T0, 3 + PTR_ADDI T0, T0, -8 + PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ + PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ + PTR_SLLI T1, KK, 6 + PTR_MUL AA, T0, K + PTR_ADD AA, AA, A + PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 /* b + kk */ + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + dgemm_dsolve_8x1 + PTR_ADDI KK, KK, -8 +.L_N1_M16: + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M0 + + PTR_SRLI T0, M, 4 + PTR_SLLI T0, T0, 4 + PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ + PTR_SLLI T0, T0, 3 + PTR_MUL AA, T0, K + PTR_ADD AA, A, AA + PTR_ADD CC, C, T0 +.align 5 +.L_N1_I1: + PTR_SLLI T0, KK, 3 + PTR_ADD B0, B, T0 + PTR_SUB L, K, KK + GADD , d, C0, CC, ZERO + PTR_SLLI T0, KK, 7 + PTR_ADD A0, AA, T0 + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_ADDI KK, KK, -16 + PTR_ADDI CC, CC, -(16 * 8) + PTR_SLLI T0, K, 7 + PTR_SUB AA, AA, T0 + blt ZERO, I, .L_N1_I1 +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S new file mode 100644 index 000000000..0e2cacccf --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S @@ -0,0 +1,959 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, A0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm +.macro nmsub_macro start0, end0, start1, reg +// Gx -= reg * Ux +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm +.macro B_st_macro start, end, stride, N +// Store Gx(x = 16...31) +.if \start <= \end +.if \N == 4 + xvst $xr\start, B0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, B0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, B0, \stride * 0x08 +.endif + B_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16 N +// The data layout of C (4x16) is as follows (store 4 data in each register): +// U0 U1 U2 U3 +// U4 U5 U6 U7 +// U8 U9 U10 U11 +// U12 U13 U14 U15 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 + GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 + GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 + GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 15 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G15 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 16x16, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows, quite large. +//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// 51 52 53 54 55 56 57 58 59 60 61 62 63 +// 68 69 70 71 72 73 74 75 76 77 78 79 +// 85 86 87 88 89 90 91 92 93 94 95 +// 102 103 104 105 106 107 108 109 110 111 +// 119 120 121 122 123 124 125 126 127 +// 136 137 138 139 140 141 142 143 +// 153 154 155 156 157 158 159 +// 170 171 172 173 174 175 +// 187 188 189 190 191 +// 204 205 206 207 +// 221 222 223 +// 238 239 +// 255 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 15, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 31, 1, G0 + PTR_ADDI A0, A0, 17 * 8 +// Load 1 + ldrepl_macro 1, 15, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 31, 2, G1 + PTR_ADDI A0, A0, 17 * 8 +// Load 2 + ldrepl_macro 2, 15, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 31, 3, G2 + PTR_ADDI A0, A0, 17 * 8 +// Load 3 + ldrepl_macro 3, 15, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 31, 4, G3 + PTR_ADDI A0, A0, 17 * 8 +// Load 4 + ldrepl_macro 4, 15, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 31, 5, G4 + PTR_ADDI A0, A0, 17 * 8 +// Load 5 + ldrepl_macro 5, 15, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 31, 6, G5 + PTR_ADDI A0, A0, 17 * 8 +// Load 6 + ldrepl_macro 6, 15, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 31, 7, G6 + PTR_ADDI A0, A0, 17 * 8 +// Load 7 + ldrepl_macro 7, 15, 0 + GMUL xvf, d, G7, G7, U7 + nmsub_macro 24, 31, 8, G7 + PTR_ADDI A0, A0, 17 * 8 +// Load 8 + ldrepl_macro 8, 15, 0 + GMUL xvf, d, G8, G8, U8 + nmsub_macro 25, 31, 9, G8 + PTR_ADDI A0, A0, 17 * 8 +// Load 9 + ldrepl_macro 9, 15, 0 + GMUL xvf, d, G9, G9, U9 + nmsub_macro 26, 31, 10, G9 + PTR_ADDI A0, A0, 17 * 8 +// Load 10 + ldrepl_macro 10, 15, 0 + GMUL xvf, d, G10, G10, U10 + nmsub_macro 27, 31, 11, G10 + PTR_ADDI A0, A0, 17 * 8 +// Load 11 + ldrepl_macro 11, 15, 0 + GMUL xvf, d, G11, G11, U11 + nmsub_macro 28, 31, 12, G11 + PTR_ADDI A0, A0, 17 * 8 +// Load 12 + ldrepl_macro 12, 15, 0 + GMUL xvf, d, G12, G12, U12 + nmsub_macro 29, 31, 13, G12 + PTR_ADDI A0, A0, 17 * 8 +// Load 13 + ldrepl_macro 13, 15, 0 + GMUL xvf, d, G13, G13, U13 + nmsub_macro 30, 31, 14, G13 + PTR_ADDI A0, A0, 17 * 8 +// Load 14 + ldrepl_macro 14, 15, 0 + GMUL xvf, d, G14, G14, U14 + nmsub_macro 31, 31, 15, G14 + PTR_ADDI A0, A0, 17 * 8 +// Load 15 + ldrepl_macro 15, 15, 0 + GMUL xvf, d, G15, G15, U15 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 31, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 + GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 + GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ + G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ + G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ + G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 +.endif +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, KK, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16 4 +.endm + +.macro dsolve_8 N +// The data layout of C (4x8) is as follows (store 4 data in each register): +// U0 U1 +// U2 U3 +// U4 U5 +// U6 U7 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 + GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 +// Now we have the following memory layout of C: +// 0 1 2 3 ... 7 +// 0 | | | | | | | +// 1 | G0 | G1 | G2 | G3 | ... | G7 | +// 2 | | | | | | | +// 3 | | | | | | | +// Next we are going to process matrix A with a size of 8x8, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 4 5 6 7 +// 9 10 11 12 13 14 15 +// 18 19 20 21 22 23 +// 27 28 29 30 31 +// 36 37 38 39 +// 45 46 47 +// 54 55 +// 63 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 7, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 23, 1, G0 + PTR_ADDI A0, A0, 9 * 8 +// Load 1 + ldrepl_macro 1, 7, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 23, 2, G1 + PTR_ADDI A0, A0, 9 * 8 +// Load 2 + ldrepl_macro 2, 7, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 23, 3, G2 + PTR_ADDI A0, A0, 9 * 8 +// Load 3 + ldrepl_macro 3, 7, 0 + GMUL xvf, d, G3, G3, U3 + nmsub_macro 20, 23, 4, G3 + PTR_ADDI A0, A0, 9 * 8 +// Load 4 + ldrepl_macro 4, 7, 0 + GMUL xvf, d, G4, G4, U4 + nmsub_macro 21, 23, 5, G4 + PTR_ADDI A0, A0, 9 * 8 +// Load 5 + ldrepl_macro 5, 7, 0 + GMUL xvf, d, G5, G5, U5 + nmsub_macro 22, 23, 6, G5 + PTR_ADDI A0, A0, 9 * 8 +// Load 6 + ldrepl_macro 6, 7, 0 + GMUL xvf, d, G6, G6, U6 + nmsub_macro 23, 23, 7, G6 + PTR_ADDI A0, A0, 9 * 8 +// Load 7 + ldrepl_macro 7, 7, 0 + GMUL xvf, d, G7, G7, U7 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 23, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 + GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20, \ + G2, C2, 0x00, G6, C2, 0x20, \ + G3, C3, 0x00, G7, C3, 0x20 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ + G1, C1, 0x00, G5, C1, 0x20 +.elseif \N == 1 + GST xv, , G0, C0, 0x00, G4, C0, 0x20 +.endif +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8 4 +.endm + +.macro dsolve_4 N +// The data layout of C (4x4) is as follows (store 4 data in each register): +// U0 +// U1 +// U2 +// U3 +// The first step is to transpose the result of C + GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 +// Now we have the following memory layout of C: +// 0 1 2 3 +// 0 | | | | | +// 1 | G0 | G1 | G2 | G3 | +// 2 | | | | | +// 3 | | | | | +// Next we are going to process matrix A with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 3, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 19, 1, G0 + PTR_ADDI A0, A0, 5 * 8 +// Load 1 + ldrepl_macro 1, 3, 0 + GMUL xvf, d, G1, G1, U1 + nmsub_macro 18, 19, 2, G1 + PTR_ADDI A0, A0, 5 * 8 +// Load 2 + ldrepl_macro 2, 3, 0 + GMUL xvf, d, G2, G2, U2 + nmsub_macro 19, 19, 3, G2 + PTR_ADDI A0, A0, 5 * 8 +// Load 3 + ldrepl_macro 3, 3, 0 + GMUL xvf, d, G3, G3, U3 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 19, 0, \N + GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 +.if \N == 4 + GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 +.elseif \N == 2 + GST xv, , G0, C0, 0x00, G1, C1, 0x00 +.elseif \N == 1 + GST xv, , G0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4 4 +.endm + +.macro dsolve_2 N +// Transpose + GSBUTTERFLY xv, d, G0, G1, U1, U0 +// Now we have the following memory layout of C: +// 0 1 +// 0 | | | +// 1 | G0 | G1 | +// 2 | | | +// 3 | | | +// Next we are going to process matrix A with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix A is as follows: +//0 1 +// 3 +// Sequentially extract data from A in row order +// Load 0 + ldrepl_macro 0, 1, 0 + GMUL xvf, d, G0, G0, U0 + nmsub_macro 17, 17, 1, G0 + PTR_ADDI A0, A0, 3 * 8 +// Load 1 + ldrepl_macro 1, 1, 0 + GMUL xvf, d, G1, G1, U1 +// Finally, We can store the result. +// For B, stored sequentially, and C, first transpose and then store + B_st_macro 16, 17, 0, \N + GSBUTTERFLY xv, d, U0, U1, G1, G0 +.if \N == 4 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U1, C3, 0x00, 0x02 + xvstelm.d U0, C2, 0x08, 0x03 + xvstelm.d U1, C3, 0x08, 0x03 +.elseif \N == 2 + vst $vr0, C0, 0x00 + vst $vr1, C1, 0x00 +.elseif \N == 1 + vst $vr0, C0, 0x00 +.endif +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2 4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 +.L_dsolve_1x4: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + xvstelm.d U0, C2, 0x00, 0x02 + xvstelm.d U0, C3, 0x00, 0x03 + // Store B + xvst U0, B0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16 2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8 2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4 2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 +.L_dsolve_1x2: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + xvstelm.d U0, C1, 0x00, 0x01 + // Store B + vst $vr0, B0, 0x00 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2 2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + dsolve_16 1 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + dsolve_8 1 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + dsolve_4 1 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + dsolve_2 1 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + GLDREPL xv, d, D0, A0, 0x00 + GMUL xvf, d, U0, U0, D0 + // Store C + xvstelm.d U0, C0, 0x00, 0x00 + // Store B + xvstelm.d U0, B0, 0x00, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + + move KK, OFFSET + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADDI KK, KK, 0x10 // kk += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADDI KK, KK, 0x08 // kk += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADDI KK, KK, 0x04 // kk += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADDI KK, KK, 0x02 // kk += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADDI KK, KK, 0x01 // kk += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S new file mode 100644 index 000000000..421339736 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S @@ -0,0 +1,882 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ + +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB B0 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define G0 D0 +#define G1 D1 +#define G2 D2 +#define G3 D3 +#define G4 D4 +#define G5 D5 +#define G6 D6 +#define G7 D7 +#define G8 D8 +#define G9 D9 +#define G10 D10 +#define G11 D11 +#define G12 D12 +#define G13 D13 +#define G14 D14 +#define G15 D15 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 20, 22, 5 + nmsub_macro 4, 7, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 + ldrepl_macro 25, 25, 15 + nmsub_macro 8, 11, 0, D2 + nmsub_macro 8, 11, 4, D5 + GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 + nmsub_macro 12, 15, 0, D3 + nmsub_macro 12, 15, 4, D6 + nmsub_macro 12, 15, 8, D8 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + ldrepl_macro 18, 18, 3 + nmsub_macro 4, 7, 0, D1 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 20, 22, 5 + nmsub_macro 2, 3, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U2, D4, U2, U3, D4, U3 + ldrepl_macro 25, 25, 15 + nmsub_macro 4, 5, 0, D2 + nmsub_macro 4, 5, 2, D5 + GMUL xvf, d, U4, D7, U4, U5, D7, U5 + nmsub_macro 6, 7, 0, D3 + nmsub_macro 6, 7, 2, D6 + nmsub_macro 6, 7, 4, D8 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + ldrepl_macro 18, 18, 3 + nmsub_macro 2, 3, 0, D1 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 2 3 +// 5 6 7 +// 10 11 +// 15 +// Sequentially extract data from B in row order + ldrepl_macro 16, 19, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 20, 22, 5 + nmsub_macro 1, 1, 0, D1 + ldrepl_macro 23, 24, 10 + GMUL xvf, d, U1, D4, U1 + + ldrepl_macro 25, 25, 15 + nmsub_macro 2, 2, 0, D2 + nmsub_macro 2, 2, 1, D5 + GMUL xvf, d, U2, D7, U2 + nmsub_macro 3, 3, 0, D3 + nmsub_macro 3, 3, 1, D6 + nmsub_macro 3, 3, 2, D8 + GMUL xvf, d, U3, D9, U3 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 1 +// 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 17, 0 + GMUL xvf, d, U0, D0, U0 + ldrepl_macro 18, 18, 3 + nmsub_macro 1, 1, 0, D1 + GMUL xvf, d, U1, D2, U1 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dgemm_dsolve_16x4 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + dsolve_1x4 +.endm + +.macro dgemm_dsolve_16x2 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x1 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, ZERO, OFFSET + /* if (!(N >> 2)) goto L_N3 */ + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 2 + PTR_ADD B, B, T0 // b += 4 * k + PTR_ADD C, C, T1 // c += 4 * ldc + PTR_ADDI KK, KK, 4 // kk += 4 + bnez J, .L_J1 +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 +.L_N2: + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + GADD , d, C0, CC, ZERO, C1, C0, LDC + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 1 + PTR_ADD B, B, T0 // b += 2 * k + PTR_ADD C, C, T1 // c += 2 * ldc + PTR_ADDI KK, KK, 2 // kk += 2 +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move AA, A + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + GADD , d, C0, CC, ZERO + move A0, AA + move B0, B + move L, KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S new file mode 100644 index 000000000..5f86d75b5 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S @@ -0,0 +1,953 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/09/26 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + * FLOAT *c, BLASLONG ldc, BLASLONG offset) + */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc +#define OFFSET $r11 // param 9: offset + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 +#define T1 $r26 +#define T2 $r27 +#define KK $r28 +#define AA $r29 +#define CC $r30 +#define BB $r31 +#undef ZERO +#define ZERO $r0 + +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +/* Prefetch interval */ +#define A_PRE 0x400 +#define B_PRE 0x100 + +#include "dtrsm_kernel_macro.S" + +.macro ldrepl_macro start, end, stride +// Load Ux (x = 0...15) +.if \start <= \end + GLDREPL xv, d, $xr\start, B0, \stride * 8 + ldrepl_macro %start + 1, \end, %stride + 1 +.endif +.endm + +.macro nmsub_macro start0, end0, start1, reg +// Ux -= reg * Dx +.if \start0 <= \end0 + xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 + nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg +.endif +.endm + +.macro A_st_macro start, end, stride, N +// Store Ux(x = 0...15) +.if \start <= \end +.if \N == 4 + xvst $xr\start, A0, \stride * 0x20 +.elseif \N == 2 + vst $vr\start, A0, \stride * 0x10 +.elseif \N == 1 + fst.d $f\start, A0, \stride * 0x08 +.endif + A_st_macro %start + 1, \end, %stride + 1, \N +.endif +.endm + +.macro dsolve_16x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 +.endm + +.macro dsolve_8x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20 +.endm + +.macro dsolve_4x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00 +.endm + +.macro dsolve_2x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 +.endm + +.macro dsolve_1x2 +// We are going to process matrix B with a size of 2x2, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//2 3 +// Sequentially extract data from B in row order + ldrepl_macro 16, 16, 0 + ldrepl_macro 17, 18, 2 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 1, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 +.endm + +.macro dsolve_16x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 + ldrepl_macro 19, 21, 8 + nmsub_macro 8, 11, 12, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 + ldrepl_macro 16, 16, 0 + nmsub_macro 4, 7, 12, D7 + nmsub_macro 4, 7, 8, D4 + GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 + nmsub_macro 0, 3, 12, D6 + nmsub_macro 0, 3, 8, D3 + nmsub_macro 0, 3, 4, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 +// Store A + A_st_macro 0, 15, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ + U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ + U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ + U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +.endm + +.macro dsolve_8x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U6, D9, U6, U7, D9, U7 + ldrepl_macro 19, 21, 8 + nmsub_macro 4, 5, 6, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U4, D5, U4, U5, D5, U5 + ldrepl_macro 16, 16, 0 + nmsub_macro 2, 3, 6, D7 + nmsub_macro 2, 3, 4, D4 + GMUL xvf, d, U2, D2, U2, U3, D2, U3 + nmsub_macro 0, 1, 6, D6 + nmsub_macro 0, 1, 4, D3 + nmsub_macro 0, 1, 2, D1 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 +// Store A + A_st_macro 0, 7, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ + U2, C1, 0x00, U3, C1, 0x20, \ + U4, C2, 0x00, U5, C2, 0x20, \ + U6, C3, 0x00, U7, C3, 0x20 +.endm + +.macro dsolve_4x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 4 +// Store C + GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 +.endm + +.macro dsolve_2x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 2 +// Store C + GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 +.endm + +.macro dsolve_1x4 +// We are going to process matrix B with a size of 4x4, +// using only the upper triangular portion. The memory layout of +// matrix B is as follows: +//0 +//4 5 +//8 9 10 +//12 13 14 15 +// Sequentially extract data from B in row order + ldrepl_macro 22, 25, 12 + GMUL xvf, d, U3, D9, U3 + ldrepl_macro 19, 21, 8 + nmsub_macro 2, 2, 3, D8 + ldrepl_macro 17, 18, 4 + GMUL xvf, d, U2, D5, U2 + ldrepl_macro 16, 16, 0 + nmsub_macro 1, 1, 3, D7 + nmsub_macro 1, 1, 2, D4 + GMUL xvf, d, U1, D2, U1 + nmsub_macro 0, 0, 3, D6 + nmsub_macro 0, 0, 2, D3 + nmsub_macro 0, 0, 1, D1 + GMUL xvf, d, U0, D0, U0 +// Store A + A_st_macro 0, 3, 0, 1 +// Store C + GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, +.endm + +.macro dgemm_dsolve_16x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x1_load + dgemm_16x1 + b .L_dsolve_16x1 +.L_dsolve_16x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 +.L_dsolve_16x1: + PTR_ADDI A0, T1, -16 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 + // Store A + A_st_macro 0, 3, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 +.endm + +.macro dgemm_dsolve_8x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x1_load + dgemm_8x1 + b .L_dsolve_8x1 +.L_dsolve_8x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 +.L_dsolve_8x1: + PTR_ADDI A0, T1, -8 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0, U1, D0, U1 + // Store A + A_st_macro 0, 1, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00, U1, C0, 0x20 +.endm + +.macro dgemm_dsolve_4x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x1_load + dgemm_4x1 + b .L_dsolve_4x1 +.L_dsolve_4x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_4x1: + PTR_ADDI A0, T1, -4 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 4 + // Strore C + GST xv, , U0, C0, 0x00 +.endm + +.macro dgemm_dsolve_2x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x1_load + dgemm_2x1 + b .L_dsolve_2x1 +.L_dsolve_2x1_load: + /* Load C0 */ + xvld U0, C0, 0x00 +.L_dsolve_2x1: + PTR_ADDI A0, T1, -2 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 2 + // Strore C + GST v, , $vr0, C0, 0x00 +.endm + +.macro dgemm_dsolve_1x1 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x1_load + dgemm_1x1 + b .L_dsolve_1x1 +.L_dsolve_1x1_load: + // Load C + fld.d $f0, C0, 0x00 +.L_dsolve_1x1: + PTR_ADDI A0, T1, -1 * 8 + PTR_ADDI B0, T2, -1 * 8 + ldrepl_macro 16, 16, 0 + GMUL xvf, d, U0, D0, U0 + // Store A + A_st_macro 0, 0, 0, 1 + // Strore C + GST f, d, $f0, C0, 0x00 +.endm + +.macro dgemm_dsolve_16x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x2_load + dgemm_16x2 + b .L_dsolve_16x2 +.L_dsolve_16x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 +.L_dsolve_16x2: + PTR_ADDI A0, T1, -(16 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_16x2 +.endm + +.macro dgemm_dsolve_8x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x2_load + dgemm_8x2 + b .L_dsolve_8x2 +.L_dsolve_8x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 +.L_dsolve_8x2: + PTR_ADDI A0, T1, -(8 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_8x2 +.endm + +.macro dgemm_dsolve_4x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x2_load + dgemm_4x2 + b .L_dsolve_4x2 +.L_dsolve_4x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_4x2: + PTR_ADDI A0, T1, -(4 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_4x2 +.endm + +.macro dgemm_dsolve_2x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x2_load + dgemm_2x2 + b .L_dsolve_2x2 +.L_dsolve_2x2_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 +.L_dsolve_2x2: + PTR_ADDI A0, T1, -(2 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_2x2 +.endm + +.macro dgemm_dsolve_1x2 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x2_load + dgemm_1x2 + xvpackod.d U1, U0, U0 + b .L_dsolve_1x2 +.L_dsolve_1x2_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 +.L_dsolve_1x2: + PTR_ADDI A0, T1, -(1 * 2) * 8 + PTR_ADDI B0, T2, -(2 * 2) * 8 + dsolve_1x2 +.endm + +.macro dgemm_dsolve_16x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_16x4_load + dgemm_16x4 + b .L_dsolve_16x4 +.L_dsolve_16x4_load: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 +/********************** solver ******************/ +.L_dsolve_16x4: + PTR_ADDI A0, T1, -(16 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_16x4 +.endm + +.macro dgemm_dsolve_8x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_8x4_load + dgemm_8x4 + b .L_dsolve_8x4 +.L_dsolve_8x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 +/********* solver *********/ +.L_dsolve_8x4: + PTR_ADDI A0, T1, -(8 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_8x4 +.endm + +.macro dgemm_dsolve_4x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_4x4_load + dgemm_4x4 + b .L_dsolve_4x4 +.L_dsolve_4x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/************** solver *****************/ +.L_dsolve_4x4: + PTR_ADDI A0, T1, -(4 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_4x4 +.endm + +.macro dgemm_dsolve_2x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_2x4_load + dgemm_2x4 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_2x4 +.L_dsolve_2x4_load: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 +/********************** solver ******************/ +.L_dsolve_2x4: + PTR_ADDI A0, T1, -(2 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_2x4 +.endm + +.macro dgemm_dsolve_1x4 + or T1, A0, A0 + or T2, B0, B0 + bge ZERO, L, .L_dsolve_1x4_load + dgemm_1x4 + xvpackod.d U1, U0, U0 + xvpermi.q U2, U0, 0x01 + xvpermi.q U3, U1, 0x01 + b .L_dsolve_1x4 +.L_dsolve_1x4_load: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 +.L_dsolve_1x4: + PTR_ADDI A0, T1, -(1 * 4) * 8 + PTR_ADDI B0, T2, -(4 * 4) * 8 + dsolve_1x4 +.endm + + PROLOGUE + push_if_used 26, 32 + PTR_SLLI LDC, LDC, 3 + PTR_SUB KK, N, OFFSET + PTR_MUL T0, N, LDC + PTR_MUL T1, N, K + PTR_ADD C, C, T0 // c += n * ldc + PTR_SLLI T1, T1, 3 + PTR_ADD B, B, T1 + + andi J, N, 1 + beqz J, .L_N2 +.L_N1: + move AA, A + PTR_SUB C, C, LDC // c -= ldc + PTR_SLLI T0, K, 3 + PTR_SLLI T1, KK, 3 + PTR_SUB B, B, T0 // b -= k + PTR_ADD BB, B, T1 // bb = b + kk + move CC, C + + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N1_M15 +.align 4 +.L_N1_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x1 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N1_I1 +.L_N1_M15: + andi I, M, 8 + beqz I, .L_N1_M7 +.L_N1_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x1 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N1_M7: + andi I, M, 4 + beqz I, .L_N1_M3 +.L_N1_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x1 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N1_M3: + andi I, M, 2 + beqz I, .L_N1_M1 +.L_N1_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x1 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N1_M1: + andi I, M, 1 + beqz I, .L_N1_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x1 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N1_M0: + PTR_ADDI KK, KK, -1 +.L_N2: + andi J, N, 2 + beq ZERO, J, .L_N4 + move AA, A + PTR_SLLI T0, LDC, 1 + PTR_SLLI T1, K, 4 + PTR_SLLI T2, KK, 4 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_N2_M15 +.align 4 +.L_N2_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x2 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_N2_I1 +.L_N2_M15: + andi I, M, 8 + beqz I, .L_N2_M7 +.L_N2_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x2 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_N2_M7: + andi I, M, 4 + beqz I, .L_N2_M3 +.L_N2_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x2 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_N2_M3: + andi I, M, 2 + beqz I, .L_N2_M1 +.L_N2_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x2 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_N2_M1: + andi I, M, 1 + beqz I, .L_N2_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x2 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_N2_M0: + PTR_ADDI KK, KK, -2 +.L_N4: + PTR_SRAI J, N, 2 /* J = bn >> 2 */ + beq ZERO, J, .L_N0 +.align 5 +.L_J1: + PTR_ADDI J, J, -1 + move AA, A + PTR_SLLI T0, LDC, 2 + PTR_SLLI T1, K, 5 + PTR_SLLI T2, KK, 5 + PTR_SUB B, B, T1 + PTR_SUB C, C, T0 + PTR_ADD BB, B, T2 + move CC, C + PTR_SRAI I, M, 4 // M >> 4 + beqz I, .L_M15 +.align 4 +.L_I1: + PTR_SLLI T1, KK, 7 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_16x4 + PTR_ADDI I, I, -1 + PTR_SLLI T0, K, 7 + PTR_ADDI CC, CC, 0x80 // cc += 16 + PTR_ADD AA, AA, T0 // aa += 16 * k + bnez I, .L_I1 +.L_M15: + andi I, M, 8 + beqz I, .L_M7 +.L_M8: + PTR_SLLI T1, KK, 6 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_8x4 + PTR_SLLI T0, K, 6 + PTR_ADDI CC, CC, 0x40 // cc += 8 + PTR_ADD AA, AA, T0 // aa += 8 * k +.L_M7: + andi I, M, 4 + beqz I, .L_M3 +.L_M4: + PTR_SLLI T1, KK, 5 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_4x4 + PTR_SLLI T0, K, 5 + PTR_ADDI CC, CC, 0x20 // cc += 4 + PTR_ADD AA, AA, T0 // aa += 4 * k +.L_M3: + andi I, M, 2 + beqz I, .L_M1 +.L_M2: + PTR_SLLI T1, KK, 4 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_2x4 + PTR_SLLI T0, K, 4 + PTR_ADDI CC, CC, 0x10 // cc += 2 + PTR_ADD AA, AA, T0 // aa += 2 * k +.L_M1: + andi I, M, 1 + beqz I, .L_M0 + PTR_SLLI T1, KK, 3 + GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC + PTR_ADD A0, AA, T1 // a0 = aa + kk + move B0, BB + PTR_SUB L, K, KK // L = K - KK + dgemm_dsolve_1x4 + PTR_SLLI T0, K, 3 + PTR_ADDI CC, CC, 0x08 // cc += 1 + PTR_ADD AA, AA, T0 // aa += 1 * k +.L_M0: + PTR_ADDI KK, KK, -4 + bnez J, .L_J1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S b/kernel/loongarch64/dtrsm_kernel_macro.S new file mode 100644 index 000000000..88b7121d1 --- /dev/null +++ b/kernel/loongarch64/dtrsm_kernel_macro.S @@ -0,0 +1,2147 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/************** Dgemm Kernel 16x4 ****************/ +.macro KERNEL2x16x4 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm + +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm + +.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm + +.macro KERNEL2x8x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm + +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm + +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + +.macro dgemm_16x4 +.L_dgemm_16x4: // See dgemm_kernel_16x4.S + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_dgemm_16x4_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_16x4_TL1_END +.align 5 +.L_dgemm_16x4_TL1: + KERNEL8x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO, TL, .L_dgemm_16x4_TL1 +.L_dgemm_16x4_TL1_END: + KERNEL8x16x4_END +.L_dgemm_16x4_L7: + andi TL, L, 7 + beq TL, ZERO, .L_dgemm_16x4_L0 +.align 5 +.L_dgemm_16x4_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x4_L71 +.L_dgemm_16x4_L0: + // Load C + GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 + GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 + GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 + GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ + U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ + U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 +.endm + +.macro dgemm_1x4 +.L_dgemm_1x4: // See dgemm_kernel_16x4.S + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x4_M1_L7 + xvldrepl.d U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END +.align 5 +.L_dgemm_1x4_M1_TL1: + KERNEL8x1x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_TL1 +.L_dgemm_1x4_M1_TL1_END: + KERNEL8x1x4_END +.L_dgemm_1x4_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x4_M1_L0 +.align 5 +.L_dgemm_1x4_M1_L71: + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x08 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x4_M1_L71 +.L_dgemm_1x4_M1_L0: + // Load C + fld.d $f0, C0, 0x00 + fld.d $f1, C1, 0x00 + fld.d $f2, C2, 0x00 + fld.d $f3, C3, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d U0, U2, 0x02 + xvinsve0.d U0, U3, 0x03 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_2x4 +.L_dgemm_2x4: + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x4_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + PTR_ADDI TL, TL, -1 + + xvld U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END +.align 5 +.L_dgemm_2x4_M2_TL1: + KERNEL8x2x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x4_M2_TL1 +.L_dgemm_2x4_M2_TL1_END: + KERNEL8x2x4_END + +.L_dgemm_2x4_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x4_M2_L0 +.align 5 +.L_dgemm_2x4_M2_L71: + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x4_M2_L71 +.L_dgemm_2x4_M2_L0: + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U1, U3, 0x02 + + GSUB xvf, d, U0, U0, D4, U1, U1, D5 +.endm + +.macro dgemm_4x4 +.L_dgemm_4x4: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x4_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END +.align 5 +.L_dgemm_4x4_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_TL1 +.L_dgemm_4x4_M4_TL1_END: + KERNEL8x4x4_END +.L_dgemm_4x4_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x4_M4_L0 +.align 5 +.L_dgemm_4x4_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x4_M4_L71 + .L_dgemm_4x4_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 +.endm + +.macro dgemm_8x4 +.L_dgemm_8x4: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + xvldrepl.d U6, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + + xvldrepl.d U7, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x4_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END +.align 5 +.L_dgemm_8x4_M8_TL1: /* TL-- */ + KERNEL8x8x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x4_M8_TL1 + +.L_dgemm_8x4_M8_TL1_END: + KERNEL8x8x4_END + +.L_dgemm_8x4_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x4_M8_L0 +.align 5 +.L_dgemm_8x4_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x4_M8_L71 +.L_dgemm_8x4_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + + /* Load C2 */ + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + + /* Load C3 */ + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + + GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ + U2, U2, D4, U3, U3, D5, \ + U4, U4, D8, U5, U5, D9, \ + U6, U6, D12, U7, U7, D13 +.endm + +.macro dgemm_4x2 +.L_dgemm_4x2: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END +.align 5 +.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 +.L_dgemm_4x2_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_dgemm_4x2_N3_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 +.align 5 +.L_dgemm_4x2_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 + +.L_dgemm_4x2_N3_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_2x2 +.L_dgemm_2x2: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END +.align 5 +.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 +.L_dgemm_2x2_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_dgemm_2x2_N3_M2_L7: + /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 +.align 5 +.L_dgemm_2x2_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 +.L_dgemm_2x2_N3_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + GSUB xvf, d, U0, U0, D0, U1, U1, D4 +.endm + +.macro dgemm_8x2 +.L_dgemm_8x2: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END +.align 5 +.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 +.L_dgemm_8x2_N3_M8_TL1_END: + KERNEL8x8x2_END + +.L_dgemm_8x2_N3_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 +.align 5 +.L_dgemm_8x2_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 + +.L_dgemm_8x2_N3_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + /* Load C1 */ + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 +.endm + +.macro dgemm_16x2 +.L_dgemm_16x2: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_dgemm_16x2_N3_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END +.align 5 +.L_dgemm_16x2_N3_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x2_N3_TL1 +.L_dgemm_16x2_N3_TL1_END: + KERNEL8x16x2_END + +.L_dgemm_16x2_N3_L7: + /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x2_N3_L0 +.align 5 +.L_dgemm_16x2_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x2_N3_L71 + +.L_dgemm_16x2_N3_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + /* Load C1 */ + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ + U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 +.endm + +.macro dgemm_2x1 +.L_dgemm_2x1: + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ + beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END +.align 5 +.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 +.L_dgemm_2x1_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_dgemm_2x1_N1_M2_L7: + /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 +.align 5 +.L_dgemm_2x1_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x10 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 +.L_dgemm_2x1_N1_M2_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_4x1 +.L_dgemm_4x1: + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ + beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 + + xvld U8, A0, 0x00 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END +.align 5 +.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 +.L_dgemm_4x1_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_dgemm_4x1_N1_M4_L7: + /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 +.align 5 +.L_dgemm_4x1_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x20 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 +.L_dgemm_4x1_N1_M4_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_8x1 +.L_dgemm_8x1: + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ + beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END +.align 5 +.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 + +.L_dgemm_8x1_N1_M8_TL1_END: + KERNEL8x8x1_END + +.L_dgemm_8x1_N1_M8_L7: + /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 +.align 5 +.L_dgemm_8x1_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 +.L_dgemm_8x1_N1_M8_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + GSUB xvf, d, U0, U0, D0, U1, U1, D1 +.endm + +.macro dgemm_16x1 +.L_dgemm_16x1: + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ + beq ZERO,TL, .L_dgemm_16x1_N1_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + PTR_ADDI TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END +.align 5 +.L_dgemm_16x1_N1_TL1: /* TL-- */ + KERNEL8x16x1 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_16x1_N1_TL1 +.L_dgemm_16x1_N1_TL1_END: + KERNEL8x16x1_END + +.L_dgemm_16x1_N1_L7: + /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_16x1_N1_L0 +.align 5 +.L_dgemm_16x1_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + PTR_ADDI A0, A0, 0x80 + PTR_ADDI B0, B0, 0x08 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_dgemm_16x1_N1_L71 +.L_dgemm_16x1_N1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 +.endm + +.macro dgemm_1x2 +.L_dgemm_1x2: // See dgemm_kernel_16x4.S + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END +.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1 +.L_dgemm_1x2_N3_M1_TL1_END: + KERNEL8x1x2_END +.L_dgemm_1x2_N3_M1_L7: + /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 +.L_dgemm_1x2_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 +.L_dgemm_1x2_N3_M1_L0: + xvld U0, C0, 0x00 + xvld U1, C1, 0x00 + xvinsve0.d U0, U1, 0x01 + xvinsve0.d D0, D4, 0x01 + GSUB xvf, d, U0, U0, D0 +.endm + +.macro dgemm_1x1 +.L_dgemm_1x1: + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + /* Load C0 */ + xvld U0, C0, 0x00 + GSUB xvf, d, U0, U0, D0 +.endm From d15e0a055cf73502f0639cec829a5396a780c753 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 27 Sep 2023 09:40:40 +0800 Subject: [PATCH 310/718] LoongArch64: Fixed compilation issues when enable DYNAMIC_ARCH --- kernel/loongarch64/dgemv_n_8_lasx.S | 10 +++++----- kernel/loongarch64/dgemv_t_8_lasx.S | 6 +++--- kernel/loongarch64/sgemv_n_8_lasx.S | 10 +++++----- kernel/loongarch64/sgemv_t_8_lasx.S | 6 +++--- lapack/laswp/loongarch64/Makefile | 5 +++++ 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S index c6523f9ab..a49bf9bb1 100644 --- a/kernel/loongarch64/dgemv_n_8_lasx.S +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -341,7 +341,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmadd.d $f10, $f12, $f2, $f10 .endm -.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req +.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req PTR_SRLI J, N, 3 beqz J, .L_\XW\()_N_7 PTR_SLLI K_LDA, LDA, 3 @@ -541,13 +541,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .hword .L_GAP_1_0 - .L_GAP_TABLE .hword .L_GAP_1_1 - .L_GAP_TABLE .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ - DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 + DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ - DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 + DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ - DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 + DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ - DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 + DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_END: pop_if_used 17 + 7, 24 + 4 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S index 7f57c1d88..71f942b0f 100644 --- a/kernel/loongarch64/dgemv_t_8_lasx.S +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -220,7 +220,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 .endm -.macro DGEMV_T XW:req X8:req, X4:req +.macro DGEMV_T_LASX XW:req X8:req, X4:req PTR_SRLI J, N, 3 beqz J, .L_\XW\()_N_7 PTR_SLLI K_LDA, LDA, 3 @@ -472,9 +472,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .hword .L_GAP_0 - .L_GAP_TABLE .hword .L_GAP_1 - .L_GAP_TABLE .L_GAP_0: /* if (incx == 1) */ - DGEMV_T GAP_0, X8, X4 + DGEMV_T_LASX GAP_0, X8, X4 .L_GAP_1: /* if (incx != 1) */ - DGEMV_T GAP_1, X8_GAP, X4_GAP + DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP .L_END: pop_if_used 17 + 8, 24 + 3 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S index da172ca50..52ffc320e 100644 --- a/kernel/loongarch64/sgemv_n_8_lasx.S +++ b/kernel/loongarch64/sgemv_n_8_lasx.S @@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GST f, s, Y0_F, Y, 0 .endm -.macro SGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req +.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req PTR_SRLI J, N, 3 beqz J, .L_\XW\()_N_7 PTR_SLLI K_LDA, LDA, 3 @@ -450,13 +450,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .hword .L_GAP_1_0 - .L_GAP_TABLE .hword .L_GAP_1_1 - .L_GAP_TABLE .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ - SGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 + SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ - SGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 + SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ - SGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 + SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ - SGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 + SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 .L_END: pop_if_used 17 + 7, 19 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S index dde3f4a30..f4bfffb42 100644 --- a/kernel/loongarch64/sgemv_t_8_lasx.S +++ b/kernel/loongarch64/sgemv_t_8_lasx.S @@ -160,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 .endm -.macro SGEMV_T XW:req X8:req, X4:req +.macro SGEMV_T_LASX XW:req X8:req, X4:req PTR_SRLI J, N, 3 beqz J, .L_\XW\()_N_7 PTR_SLLI K_LDA, LDA, 3 @@ -396,9 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .hword .L_GAP_0 - .L_GAP_TABLE .hword .L_GAP_1 - .L_GAP_TABLE .L_GAP_0: /* if (incx == 1) */ - SGEMV_T GAP_0, X8, X4 + SGEMV_T_LASX GAP_0, X8, X4 .L_GAP_1: /* if (incx != 1) */ - SGEMV_T GAP_1, X8_GAP, X4_GAP + SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP .L_END: pop_if_used 17 + 8, 18 jirl $r0, $r1, 0x0 diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile index 71e5a87cb..1c85667ec 100644 --- a/lapack/laswp/loongarch64/Makefile +++ b/lapack/laswp/loongarch64/Makefile @@ -1,6 +1,11 @@ TOPDIR = ../../.. include ../../../Makefile.system +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP LASWP = ../generic/laswp_k.c endif From 211bb35dee47ba6ade41a28171738f62e5621f4b Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 27 Sep 2023 10:10:41 +0800 Subject: [PATCH 311/718] gh-actions: Adding DYNAMIC_ARCH test for LoongArch64 --- .github/workflows/loongarch64.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 5501e98e0..e0236ca86 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -18,6 +18,9 @@ jobs: - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + - target: DYNAMIC_ARCH + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC steps: - name: Checkout repository From a92dc25fb3a257ad418da2d1227b092db49bc2b4 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 29 Sep 2023 08:08:29 -0500 Subject: [PATCH 312/718] Fix Makefile.power for xlf --- Makefile.power | 4 ++++ f_check | 3 +++ 2 files changed, 7 insertions(+) diff --git a/Makefile.power b/Makefile.power index 33702c932..46afb2d4a 100644 --- a/Makefile.power +++ b/Makefile.power @@ -70,8 +70,12 @@ else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif +endif else FCOMMON_OPT += -O2 -Mrecursive endif diff --git a/f_check b/f_check index f30231bc4..31f4376d0 100755 --- a/f_check +++ b/f_check @@ -117,6 +117,9 @@ else vendor=PGI openmp='-mp' ;; + *xlf*) + vendor=IBM + ;; *) vendor=G77 openmp='' From a69367c43b28e2e0029d42092e791415565fe804 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 29 Sep 2023 09:29:41 -0500 Subject: [PATCH 313/718] Fix Makefile.system for OpenXL. --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index ae6db40b0..af840f029 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1170,6 +1170,8 @@ CCOMMON_OPT += -DF_INTERFACE_IBM FEXTRALIB += -lxlf90 ifeq ($(C_COMPILER), GCC) FCOMMON_OPT += -qextname +else ifeq ($(C_COMPILER), CLANG) +FCOMMON_OPT += -qextname endif # FCOMMON_OPT += -qarch=440 ifdef BINARY64 From a11e1e10f436f4601275669bdf7b951e6e8df0e8 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 29 Sep 2023 10:32:34 -0500 Subject: [PATCH 314/718] powerpc: Fix build errors with xlf This patch fixes errors when using xlf as fortran compiler on Linux. Tested with gcc/xlf and clang/xlf compiler combinations. --- Makefile.power | 4 ++++ Makefile.system | 2 +- f_check | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 33702c932..46afb2d4a 100644 --- a/Makefile.power +++ b/Makefile.power @@ -70,8 +70,12 @@ else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif +endif else FCOMMON_OPT += -O2 -Mrecursive endif diff --git a/Makefile.system b/Makefile.system index ae6db40b0..b1a357fdf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1168,7 +1168,7 @@ endif ifeq ($(F_COMPILER), IBM) CCOMMON_OPT += -DF_INTERFACE_IBM FEXTRALIB += -lxlf90 -ifeq ($(C_COMPILER), GCC) +ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) FCOMMON_OPT += -qextname endif # FCOMMON_OPT += -qarch=440 diff --git a/f_check b/f_check index f30231bc4..31f4376d0 100755 --- a/f_check +++ b/f_check @@ -117,6 +117,9 @@ else vendor=PGI openmp='-mp' ;; + *xlf*) + vendor=IBM + ;; *) vendor=G77 openmp='' From ccbb91e4a739ca05b3cefb130863f89dcc1e92f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:46:34 +0200 Subject: [PATCH 315/718] fix improper function prototypes (empty parentheses) --- common_thread.h | 20 ++++++++++---------- cpuid_x86.c | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/common_thread.h b/common_thread.h index 06a7a1a38..70c724597 100644 --- a/common_thread.h +++ b/common_thread.h @@ -141,14 +141,14 @@ static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - openmp_nthreads=omp_get_max_threads(); + openmp_nthreads=omp_get_max_threads(void); #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 #endif #ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel() + if (openmp_nthreads == 1 || omp_in_parallel(void) #endif ) return 1; @@ -192,27 +192,27 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int threads); + void *c, BLASLONG ldc, int (*function)(void), int threads); -int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *, void *, BLASLONG); -int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG), void *, void *, BLASLONG); -int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG); -int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG); +int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG, BLASLONG); int trsm_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *a, BLASLONG lda, - void *c, BLASLONG ldc, int (*function)(), void *buffer); + void *c, BLASLONG ldc, int (*function)(void), void *buffer); -int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); +int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void*, void*, BLASLONG); int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *offsetA, BLASLONG lda, void *offsetB, BLASLONG jb, - void *ipiv, BLASLONG offset, int (*function)(), void *buffer); + void *ipiv, BLASLONG offset, int (*function)(void), void *buffer); #endif /* ENDIF ASSEMBLER */ diff --git a/cpuid_x86.c b/cpuid_x86.c index c485f3ddf..fdcead8bd 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -194,7 +194,7 @@ static C_INLINE void xgetbv(int op, int * eax, int * edx){ } #endif -int support_avx(){ +int support_avx(void){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; @@ -212,7 +212,7 @@ int support_avx(){ #endif } -int support_avx2(){ +int support_avx2(void){ #ifndef NO_AVX2 int eax, ebx, ecx=0, edx; int ret=0; @@ -228,7 +228,7 @@ int support_avx2(){ #endif } -int support_avx512(){ +int support_avx512(void){ #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; @@ -250,7 +250,7 @@ int support_avx512(){ #endif } -int support_avx512_bf16(){ +int support_avx512_bf16(void){ #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; @@ -271,7 +271,7 @@ int support_avx512_bf16(){ #define BIT_AMX_BF16 0x00400000 #define BIT_AMX_ENBD 0x00060000 -int support_amx_bf16() { +int support_amx_bf16(void) { #if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From c4bd4a2e5dbbb648ac8198f6193657ea403d088b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:49:24 +0200 Subject: [PATCH 316/718] fix improper function prototypes (empty parentheses) --- driver/level3/gemm_thread_m.c | 2 +- driver/level3/gemm_thread_mn.c | 2 +- driver/level3/gemm_thread_n.c | 2 +- driver/level3/gemm_thread_variable.c | 2 +- driver/level3/syrk_thread.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c index 8813e5529..353ae0be9 100644 --- a/driver/level3/gemm_thread_m.c +++ b/driver/level3/gemm_thread_m.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 6b52df884..4f370999a 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -60,7 +60,7 @@ static const int divide_rule[][2] = { 1, 61}, { 2, 31}, { 7, 9}, { 8, 8}, }; -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index 9668841bb..d583456bd 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 162a75f70..75e49cb1a 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -42,7 +42,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { + int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { blas_queue_t queue[MAX_CPU_NUMBER]; diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 12808afd5..a40122e38 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -41,7 +41,7 @@ #include #include "common.h" -int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) { blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; From c6b1d8e7a31f96b6e17fdd92fb6dbbbb2ef7b562 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:52:06 +0200 Subject: [PATCH 317/718] fix improper function prototypes (empty parentheses) --- driver/others/blas_l1_thread.c | 4 ++-- driver/others/blas_server.c | 2 +- driver/others/memory.c | 10 +++++----- driver/others/openblas_env.c | 18 +++++++++--------- driver/others/openblas_error_handle.c | 2 +- driver/others/openblas_get_config.c | 8 ++++---- driver/others/openblas_get_parallel.c | 6 +++--- driver/others/parameter.c | 2 +- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 06039c952..01b254f5d 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -43,7 +43,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads){ + void *c, BLASLONG ldc, int (*function)(void), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; @@ -141,7 +141,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads){ + void *c, BLASLONG ldc, int (*function)(void), int nthreads){ blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index a8a84acbb..2fcb37192 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -93,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -extern unsigned int openblas_thread_timeout(); +extern unsigned int openblas_thread_timeout(void); #ifdef SMP_SERVER diff --git a/driver/others/memory.c b/driver/others/memory.c index b27fec431..a74cdabd4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1999,7 +1999,7 @@ int goto_get_num_procs (void) { return blas_cpu_number; } -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -2016,9 +2016,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) @@ -3339,7 +3339,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; -extern void openblas_read_env(); +extern void openblas_read_env(void); void CONSTRUCTOR gotoblas_init(void) { diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index 35b2270d4..c65f0f320 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -41,15 +41,15 @@ static int openblas_env_goto_num_threads=0; static int openblas_env_omp_num_threads=0; static int openblas_env_omp_adaptive=0; -int openblas_verbose() { return openblas_env_verbose;} -unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} -int openblas_block_factor() { return openblas_env_block_factor;} -int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} -int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} -int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} -int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;} - -void openblas_read_env() { +int openblas_verbose(void) { return openblas_env_verbose;} +unsigned int openblas_thread_timeout(void) { return openblas_env_thread_timeout;} +int openblas_block_factor(void) { return openblas_env_block_factor;} +int openblas_num_threads_env(void) { return openblas_env_openblas_num_threads;} +int openblas_goto_num_threads_env(void) { return openblas_env_goto_num_threads;} +int openblas_omp_num_threads_env(void) { return openblas_env_omp_num_threads;} +int openblas_omp_adaptive_env(void) { return openblas_env_omp_adaptive;} + +void openblas_read_env(void) { int ret=0; env_var_t p; if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c index 9ac72c15d..aa0aa776a 100644 --- a/driver/others/openblas_error_handle.c +++ b/driver/others/openblas_error_handle.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -extern int openblas_verbose(); +extern int openblas_verbose(void); void openblas_warning(int verbose, const char * msg) { int current_verbose; diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 7a5cbeb62..867d0e361 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -69,13 +69,13 @@ static char* openblas_config_str="" ; #ifdef DYNAMIC_ARCH -char *gotoblas_corename(); +char *gotoblas_corename(void); #endif static char tmp_config_str[256]; -int openblas_get_parallel(); +int openblas_get_parallel(void); -char* CNAME() { +char* CNAME(void) { char tmpstr[20]; strcpy(tmp_config_str, openblas_config_str); #ifdef DYNAMIC_ARCH @@ -90,7 +90,7 @@ char tmpstr[20]; } -char* openblas_get_corename() { +char* openblas_get_corename(void) { #ifndef DYNAMIC_ARCH return CHAR_CORENAME; #else diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c index 5dfda6e59..becfa0a3a 100644 --- a/driver/others/openblas_get_parallel.c +++ b/driver/others/openblas_get_parallel.c @@ -42,17 +42,17 @@ static int parallel = 0; #ifdef NEEDBUNDERSCORE -int CNAME() { +int CNAME(void) { return parallel; } -int NAME() { +int NAME(void) { return parallel; } #else //The CNAME and NAME are the same. -int NAME() { +int NAME(void) { return parallel; } #endif diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 0d5c6aec0..de6bf0de4 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -40,7 +40,7 @@ #include #include "common.h" -extern int openblas_block_factor(); +extern int openblas_block_factor(void); int get_L2_size(void); #define DEFAULT_GEMM_P 128 From 13ba4edf4373c324bc46a97ec6e96764d44fb873 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:53:35 +0200 Subject: [PATCH 318/718] fix function prototypes (empty parentheses) --- interface/lapack/laswp.c | 2 +- interface/lapack/zlaswp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index 0dde33ae3..6544dbc5b 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, - (int(*)())laswp[flag], nthreads); + (int(*)(void))laswp[flag], nthreads); } #endif diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index b77a40985..7bb4a659e 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads); + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)(void))laswp[flag], nthreads); } #endif From 675cd551da315af964b6b097e6e5ab7b35bd6e59 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:56:38 +0200 Subject: [PATCH 319/718] fix improper function prototypes (empty parentheses) --- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/drot.c | 2 +- kernel/x86_64/srot.c | 2 +- kernel/x86_64/zdot.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index f3b9ee701..569ed2416 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -159,7 +159,7 @@ static int dot_thread_function(BLASLONG n, BLASLONG dummy0, extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); + void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 40c9cf19d..6fdf4ae56 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -169,7 +169,7 @@ static int rot_thread_function(blas_arg_t *args) return 0; } -extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index a49544616..05724b427 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -171,7 +171,7 @@ static int rot_thread_function(blas_arg_t *args) return 0; } -extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) { diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 72a712a9e..51efa2dfe 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -92,7 +92,7 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #if defined(SMP) extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); + void *c, BLASLONG ldc, int (*function)(void), int nthreads); #endif From cd8ac192a901b38980755583faaa35559df7910a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:58:30 +0200 Subject: [PATCH 320/718] fix improper function prototypes (empty parentheses) --- lapack-netlib/LAPACKE/src/lapacke_nancheck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c index c7d5c33f1..bb894f351 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c +++ b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c @@ -39,7 +39,7 @@ void LAPACKE_set_nancheck( int flag ) nancheck_flag = ( flag ) ? 1 : 0; } -int LAPACKE_get_nancheck( ) +int LAPACKE_get_nancheck( void ) { char* env; if ( nancheck_flag != -1 ) { From f4f31fb53b5f4069ae19a1840035bb770e237945 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 12:59:44 +0200 Subject: [PATCH 321/718] fix improper function prototypes (empty parentheses) --- lapack/lauum/lauum_L_parallel.c | 4 ++-- lapack/lauum/lauum_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 1b32e4519..0f4eaefaa 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index f5ea54c88..77bfeebc7 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; From 1d4aa8d7d52f469724c26c1378ddf9cca778ce99 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 13:00:51 +0200 Subject: [PATCH 322/718] fix improper function prototypes (empty parentheses) --- lapack/potrf/potrf_L_parallel.c | 2 +- lapack/potrf/potrf_U_parallel.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 986816d1a..7d6bcd776 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index cc6ff9912..1f1427276 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; From 60ff5872af834658dbc2cf5d72f375c37052032e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 13:01:44 +0200 Subject: [PATCH 323/718] fix improper function prototypes (empty parentheses) --- utest/ctest.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/utest/ctest.h b/utest/ctest.h index 79961badf..b158b4538 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -41,7 +41,7 @@ typedef void (*RunWithDataFunc)(void*); struct ctest { const char* ssname; // suite name const char* ttname; // test name - void (*run)(); + void (*run)(void); int skip; void* data; @@ -159,9 +159,9 @@ struct ctest { void WEAK sname##_teardown(struct sname##_data* data) #define __CTEST_INTERNAL(sname, tname, _skip) \ - void __FNAME(sname, tname)(); \ + void __FNAME(sname, tname)(void); \ __CTEST_STRUCT(sname, tname, _skip, NULL, NULL, NULL) \ - void __FNAME(sname, tname)() + void __FNAME(sname, tname)(void) #ifdef __CTEST_APPLE #define SETUP_FNAME(sname) NULL @@ -366,7 +366,7 @@ void __ctest_addTest(struct ctest *test) #ifndef __CTEST_MSVC /* Add all tests to linked list automatically. */ -static void __ctest_linkTests() +static void __ctest_linkTests(void) { struct ctest ** test; struct ctest ** ctest_begin = (struct ctest **)__PNAME(suite, test); @@ -401,7 +401,7 @@ static void __ctest_linkTests() __ctest_head_p = ctest_begin; } #else //for msvc -static void __ctest_linkTests() +static void __ctest_linkTests(void) { struct ctest ** ctest_start = __ctest_head_p; struct ctest ** test; @@ -450,7 +450,7 @@ static void msg_start(const char* color, const char* title) { print_errormsg(" %s: ", title); } -static void msg_end() { +static void msg_end(void) { if (color_output) { print_errormsg(ANSI_NORMAL); } @@ -634,7 +634,7 @@ static int suite_test_filter(struct ctest* t) { #ifndef __CTEST_NO_TIME -static uint64_t getCurrentTime() { +static uint64_t getCurrentTime(void) { struct timeval now; gettimeofday(&now, NULL); uint64_t now64 = (uint64_t) now.tv_sec; From 2dba455d2e950da2ea40cd9c6696294419b2a538 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 15:33:55 +0200 Subject: [PATCH 324/718] revert accidental changes --- common_thread.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_thread.h b/common_thread.h index 70c724597..6e18d2a8e 100644 --- a/common_thread.h +++ b/common_thread.h @@ -141,14 +141,14 @@ static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - openmp_nthreads=omp_get_max_threads(void); + openmp_nthreads=omp_get_max_threads(); #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 #endif #ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel(void) + if (openmp_nthreads == 1 || omp_in_parallel() #endif ) return 1; From cf2174fb6967db717ec5ec640f3a88d0ccf0372e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 17:04:39 +0200 Subject: [PATCH 325/718] fix improper function prototypes (empty parentheses) --- driver/others/blas_server_omp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fe6b4a7c0..3e2179373 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -70,7 +70,7 @@ int blas_server_avail = 0; int blas_omp_number_max = 0; -extern int openblas_omp_adaptive_env(); +extern int openblas_omp_adaptive_env(void); static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 @@ -79,7 +79,7 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #endif -static void adjust_thread_buffers() { +static void adjust_thread_buffers(void) { int i=0, j=0; @@ -124,9 +124,9 @@ void openblas_set_num_threads(int num_threads) { } int blas_thread_init(void){ -if(blas_omp_number_max <= 0) - blas_omp_number_max = omp_get_max_threads(); - + if(blas_omp_number_max <= 0) + blas_omp_number_max = omp_get_max_threads(); + blas_get_cpu_number(); adjust_thread_buffers(); From 90f890ee675945cdb7d7d9887e4baf50c7d5bb29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Sep 2023 23:12:36 +0200 Subject: [PATCH 326/718] fix improper function prototypes (empty parentheses) (USE_TLS branch) --- driver/others/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index a74cdabd4..caef3e2b7 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -427,9 +427,9 @@ int goto_get_num_procs (void) { return blas_cpu_number; } -static void blas_memory_init(); +static void blas_memory_init(void); -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -446,9 +446,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) @@ -592,7 +592,7 @@ static BLASULONG key_lock = 0UL; #endif /* Returns a pointer to the start of the per-thread memory allocation data */ -static __inline struct alloc_t ** get_memory_table() { +static __inline struct alloc_t ** get_memory_table(void) { #if defined(SMP) LOCK_COMMAND(&key_lock); lsk=local_storage_key; @@ -1145,7 +1145,7 @@ static void blas_memory_cleanup(void* ptr){ } } -static void blas_memory_init(){ +static void blas_memory_init(void){ #if defined(SMP) # if defined(OS_WINDOWS) local_storage_key = TlsAlloc(); @@ -1502,7 +1502,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; -extern void openblas_read_env(); +extern void openblas_read_env(void); void CONSTRUCTOR gotoblas_init(void) { From 3b1150fcee164922ed932c7d46b28a8ffec744a8 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:00:48 -0500 Subject: [PATCH 327/718] Fix CPU identification to work on AIX. --- driver/others/dynamic_power.c | 216 +++++++++------------------------- 1 file changed, 58 insertions(+), 158 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 2847ea9ae..7f8bfd5b9 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -20,12 +20,10 @@ static char *corename[] = { "POWER10" }; -#define NUM_CORETYPES 4 +#define NUM_CORETYPES 5 char *gotoblas_corename(void) { -#ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; -#endif if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; @@ -36,177 +34,81 @@ char *gotoblas_corename(void) { return corename[0]; } -#if defined(__clang__) -static int __builtin_cpu_supports(char* arg) -{ - return 0; -} -#endif - -#if defined(C_PGI) || defined(__clang__) -/* - * NV HPC compilers do not yet implement __builtin_cpu_is(). - * Fake a version here for use in the CPU detection code below. - * - * Strategy here is to first check the CPU to see what it actually is, - * and then test the input to see if what the CPU actually is matches - * what was requested. - */ +#ifdef _AIX +#include -#include - -/* - * Define POWER processor version table. - * - * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time - */ - -#define CPU_UNKNOWN 0 -#define CPU_POWER5 5 -#define CPU_POWER6 6 -#define CPU_POWER8 8 -#define CPU_POWER9 9 +#define CPU_UNKNOWN 0 +#define CPU_POWER6 6 +#define CPU_POWER7 7 +#define CPU_POWER8 8 +#define CPU_POWER9 9 #define CPU_POWER10 10 -static struct { - uint32_t pvr_mask; - uint32_t pvr_value; - const char* cpu_name; - uint32_t cpu_type; -} pvrPOWER [] = { - - { /* POWER6 in P5+ mode; 2.04-compliant processor */ - .pvr_mask = 0xffffffff, - .pvr_value = 0x0f000001, - .cpu_name = "POWER5+", - .cpu_type = CPU_POWER5, - }, - - { /* Power6 aka POWER6X*/ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003e0000, - .cpu_name = "POWER6 (raw)", - .cpu_type = CPU_POWER6, - }, - - { /* Power7 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x003f0000, - .cpu_name = "POWER7 (raw)", - .cpu_type = CPU_POWER6, - }, - - { /* Power7+ */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004A0000, - .cpu_name = "POWER7+ (raw)", - .cpu_type = CPU_POWER6, - }, - - { /* Power8E */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004b0000, - .cpu_name = "POWER8E (raw)", - .cpu_type = CPU_POWER8, - }, - - { /* Power8NVL */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004c0000, - .cpu_name = "POWER8NVL (raw)", - .cpu_type = CPU_POWER8, - }, - - { /* Power8 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004d0000, - .cpu_name = "POWER8 (raw)", - .cpu_type = CPU_POWER8, - }, - - { /* Power9 DD2.0 */ - .pvr_mask = 0xffffefff, - .pvr_value = 0x004e0200, - .cpu_name = "POWER9 (raw)", - .cpu_type = CPU_POWER9, - }, - - { /* Power9 DD 2.1 */ - .pvr_mask = 0xffffefff, - .pvr_value = 0x004e0201, - .cpu_name = "POWER9 (raw)", - .cpu_type = CPU_POWER9, - }, - - { /* Power9 DD2.2 or later */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x004e0000, - .cpu_name = "POWER9 (raw)", - .cpu_type = CPU_POWER9, - }, - - { /* Power10 */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00800000, - .cpu_name = "POWER10 (raw)", - .cpu_type = CPU_POWER10, - }, - - { /* End of table, pvr_mask and pvr_value must be zero */ - .pvr_mask = 0x0, - .pvr_value = 0x0, - .cpu_name = "Unknown", - .cpu_type = CPU_UNKNOWN, - }, -}; - -static int __builtin_cpu_is(const char *cpu) { - int i; - uint32_t pvr; - uint32_t cpu_type; - - asm("mfpvr %0" : "=r"(pvr)); - - for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { - if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { - break; - } - } - -#if defined(DEBUG) - printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, - pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); +int cpuid() +{ + int arch = _system_configuration.implementation; +#ifdef POWER_6 + if (arch == POWER_6) return CPU_POWER6; #endif - cpu_type = pvrPOWER[i].cpu_type; - - if (!strcmp(cpu, "power8")) - return cpu_type == CPU_POWER8; - if (!strcmp(cpu, "power9")) - return cpu_type == CPU_POWER9; - return 0; +#ifdef POWER_7 + else if (arch == POWER_7) return CPU_POWER7; +#endif +#ifdef POWER_8 + else if (arch == POWER_8) return CPU_POWER8; +#endif +#ifdef POWER_9 + else if (arch == POWER_9) return CPU_POWER9; +#endif +#ifdef POWER_10 + else if (arch == POWER_10) return CPU_POWER10; +#endif + return CPU_UNKNOWN; } -#endif /* C_PGI */ +#ifndef __BUILTIN_CPU_SUPPORTS__ +static int __builtin_cpu_supports(char* arg) +{ + static int ipinfo = -1; + if (ipinfo < 0) { + ipinfo = cpuid(); + } + if (ipinfo >= CPU_POWER10) { + if (!strcmp(arg, "power10")) return 1; + } + if (ipinfo >= CPU_POWER9) { + if (!strcmp(arg, "power9")) return 1; + } + if (ipinfo >= CPU_POWER8) { + if (!strcmp(arg, "power8")) return 1; + } + if (ipinfo >= CPU_POWER6) { + if (!strcmp(arg, "power6")) return 1; + } + return 0; +} +#endif static gotoblas_t *get_coretype(void) { -#ifndef C_PGI - if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) + if (__builtin_cpu_supports("power6")) return &gotoblas_POWER6; -#endif - if (__builtin_cpu_is("power8")) + if (__builtin_cpu_supports("power8")) return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) - if (__builtin_cpu_is("power9")) + if (__builtin_cpu_supports("power9")) return &gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) +#ifdef _AIX + if (__builtin_cpu_supports("power10")) +#else + if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) +#endif return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ -#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) - if (__builtin_cpu_is("power10")) +#if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2) + if (__builtin_cpu_supports("power10")) return &gotoblas_POWER9; #endif return NULL; @@ -229,9 +131,7 @@ static gotoblas_t *force_coretype(char * coretype) { switch (found) { -#ifndef C_PGI case 1: return (&gotoblas_POWER6); -#endif case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); From eb738d99293dc658bd6941cc6c2b76cd6ece0c11 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:14:46 -0500 Subject: [PATCH 328/718] Minor changes. --- Makefile.system | 2 -- driver/others/dynamic_power.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1fd47e68e..b1a357fdf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1170,8 +1170,6 @@ CCOMMON_OPT += -DF_INTERFACE_IBM FEXTRALIB += -lxlf90 ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) FCOMMON_OPT += -qextname -else ifeq ($(C_COMPILER), CLANG) -FCOMMON_OPT += -qextname endif # FCOMMON_OPT += -qarch=440 ifdef BINARY64 diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 7f8bfd5b9..1d3f36875 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -44,7 +44,7 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 -int cpuid() +static int cpuid(void) { int arch = _system_configuration.implementation; #ifdef POWER_6 From 12130ee9613936f2fa49fd58a7f6bf8210a65552 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:19:22 -0500 Subject: [PATCH 329/718] Remove tab. --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 1d3f36875..3c5f1f3c1 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -102,7 +102,7 @@ static gotoblas_t *get_coretype(void) { #ifdef _AIX if (__builtin_cpu_supports("power10")) #else - if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) + if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) #endif return &gotoblas_POWER10; #endif From a922a07e610e0508e2f2f84ae158c46e2e3d7a0e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:24:30 -0500 Subject: [PATCH 330/718] Cleanup white spaces. --- driver/others/dynamic_power.c | 158 +++++++++++++++++----------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 3c5f1f3c1..40f00a634 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -13,25 +13,25 @@ extern gotoblas_t gotoblas_POWER10; extern void openblas_warning(int verbose, const char *msg); static char *corename[] = { - "unknown", - "POWER6", - "POWER8", - "POWER9", - "POWER10" + "unknown", + "POWER6", + "POWER8", + "POWER9", + "POWER10" }; #define NUM_CORETYPES 5 char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_POWER6) return corename[1]; - if (gotoblas == &gotoblas_POWER8) return corename[2]; + if (gotoblas == &gotoblas_POWER6) return corename[1]; + if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) - if (gotoblas == &gotoblas_POWER9) return corename[3]; + if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif #ifdef HAVE_P10_SUPPORT - if (gotoblas == &gotoblas_POWER10) return corename[4]; + if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif - return corename[0]; + return corename[0]; } #ifdef _AIX @@ -90,13 +90,13 @@ static int __builtin_cpu_supports(char* arg) static gotoblas_t *get_coretype(void) { - if (__builtin_cpu_supports("power6")) - return &gotoblas_POWER6; - if (__builtin_cpu_supports("power8")) - return &gotoblas_POWER8; + if (__builtin_cpu_supports("power6")) + return &gotoblas_POWER6; + if (__builtin_cpu_supports("power8")) + return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) - if (__builtin_cpu_supports("power9")) - return &gotoblas_POWER9; + if (__builtin_cpu_supports("power9")) + return &gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT #ifdef _AIX @@ -104,84 +104,84 @@ static gotoblas_t *get_coretype(void) { #else if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) #endif - return &gotoblas_POWER10; + return &gotoblas_POWER10; #endif - /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ #if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2) - if (__builtin_cpu_supports("power10")) - return &gotoblas_POWER9; -#endif - return NULL; + if (__builtin_cpu_supports("power10")) + return &gotoblas_POWER9; +#endif + return NULL; } static gotoblas_t *force_coretype(char * coretype) { - int i ; - int found = -1; - char message[128]; - - for ( i = 0 ; i < NUM_CORETYPES; i++) - { - if (!strncasecmp(coretype, corename[i], 20)) - { - found = i; - break; - } - } - - switch (found) - { - case 1: return (&gotoblas_POWER6); - case 2: return (&gotoblas_POWER8); + int i ; + int found = -1; + char message[128]; + + for ( i = 0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 1: return (&gotoblas_POWER6); + case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) - case 3: return (&gotoblas_POWER9); + case 3: return (&gotoblas_POWER9); #endif #ifdef HAVE_P10_SUPPORT - case 4: return (&gotoblas_POWER10); + case 4: return (&gotoblas_POWER10); #endif - default: return NULL; - } - snprintf(message, 128, "Core not found: %s\n", coretype); - openblas_warning(1, message); + default: return NULL; + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); } void gotoblas_dynamic_init(void) { - char coremsg[128]; - char coren[22]; - char *p; - - - if (gotoblas) return; - - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } - - if (gotoblas == NULL) - { - snprintf(coremsg, 128, "Falling back to POWER8 core\n"); - openblas_warning(1, coremsg); - gotoblas = &gotoblas_POWER8; - } - - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); - exit(1); - } + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to POWER8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_POWER8; + } + + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } } void gotoblas_dynamic_quit(void) { - gotoblas = NULL; + gotoblas = NULL; } From 10210748de17a217fd67f6cb8501272b8bfa88c2 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:44:07 -0500 Subject: [PATCH 331/718] Revert PGI changes. --- driver/others/dynamic_power.c | 310 +++++++++++++++++++++++++--------- 1 file changed, 234 insertions(+), 76 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 40f00a634..0f5b06be5 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -13,27 +13,181 @@ extern gotoblas_t gotoblas_POWER10; extern void openblas_warning(int verbose, const char *msg); static char *corename[] = { - "unknown", - "POWER6", - "POWER8", - "POWER9", - "POWER10" + "unknown", + "POWER6", + "POWER8", + "POWER9", + "POWER10" }; #define NUM_CORETYPES 5 char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_POWER6) return corename[1]; - if (gotoblas == &gotoblas_POWER8) return corename[2]; +#ifndef C_PGI + if (gotoblas == &gotoblas_POWER6) return corename[1]; +#endif + if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) - if (gotoblas == &gotoblas_POWER9) return corename[3]; + if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif #ifdef HAVE_P10_SUPPORT - if (gotoblas == &gotoblas_POWER10) return corename[4]; + if (gotoblas == &gotoblas_POWER10) return corename[4]; +#endif + return corename[0]; +} + +#if defined(__clang__) +static int __builtin_cpu_supports(char* arg) +{ + return 0; +} +#endif + +#if defined(C_PGI) || defined(__clang__) +/* + * NV HPC compilers do not yet implement __builtin_cpu_is(). + * Fake a version here for use in the CPU detection code below. + * + * Strategy here is to first check the CPU to see what it actually is, + * and then test the input to see if what the CPU actually is matches + * what was requested. + */ + +#include + +/* + * Define POWER processor version table. + * + * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time + */ + +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +static struct { + uint32_t pvr_mask; + uint32_t pvr_value; + const char* cpu_name; + uint32_t cpu_type; +} pvrPOWER [] = { + + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_type = CPU_POWER5, + }, + + { /* Power6 aka POWER6X*/ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD2.2 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_type = CPU_POWER10, + }, + + { /* End of table, pvr_mask and pvr_value must be zero */ + .pvr_mask = 0x0, + .pvr_value = 0x0, + .cpu_name = "Unknown", + .cpu_type = CPU_UNKNOWN, + }, +}; + +static int __builtin_cpu_is(const char *cpu) { + int i; + uint32_t pvr; + uint32_t cpu_type; + + asm("mfpvr %0" : "=r"(pvr)); + + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } + +#if defined(DEBUG) + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); #endif - return corename[0]; + cpu_type = pvrPOWER[i].cpu_type; + + if (!strcmp(cpu, "power8")) + return cpu_type == CPU_POWER8; + if (!strcmp(cpu, "power9")) + return cpu_type == CPU_POWER9; + return 0; } +#endif /* C_PGI */ + #ifdef _AIX #include @@ -90,98 +244,102 @@ static int __builtin_cpu_supports(char* arg) static gotoblas_t *get_coretype(void) { - if (__builtin_cpu_supports("power6")) - return &gotoblas_POWER6; - if (__builtin_cpu_supports("power8")) - return &gotoblas_POWER8; +#ifndef C_PGI + if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) + return &gotoblas_POWER6; +#endif + if (__builtin_cpu_is("power8")) + return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) - if (__builtin_cpu_supports("power9")) - return &gotoblas_POWER9; + if (__builtin_cpu_is("power9")) + return &gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT #ifdef _AIX - if (__builtin_cpu_supports("power10")) + if (__builtin_cpu_supports("power10")) #else - if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma")) -#endif - return &gotoblas_POWER10; + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) #endif - /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ -#if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2) - if (__builtin_cpu_supports("power10")) - return &gotoblas_POWER9; + return &gotoblas_POWER10; #endif - return NULL; + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif + return NULL; } static gotoblas_t *force_coretype(char * coretype) { - int i ; - int found = -1; - char message[128]; - - for ( i = 0 ; i < NUM_CORETYPES; i++) - { - if (!strncasecmp(coretype, corename[i], 20)) - { - found = i; - break; - } - } + int i ; + int found = -1; + char message[128]; - switch (found) - { - case 1: return (&gotoblas_POWER6); - case 2: return (&gotoblas_POWER8); + for ( i = 0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { +#ifndef C_PGI + case 1: return (&gotoblas_POWER6); +#endif + case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) - case 3: return (&gotoblas_POWER9); + case 3: return (&gotoblas_POWER9); #endif #ifdef HAVE_P10_SUPPORT - case 4: return (&gotoblas_POWER10); + case 4: return (&gotoblas_POWER10); #endif - default: return NULL; - } - snprintf(message, 128, "Core not found: %s\n", coretype); - openblas_warning(1, message); + default: return NULL; + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); } void gotoblas_dynamic_init(void) { - char coremsg[128]; - char coren[22]; - char *p; + char coremsg[128]; + char coren[22]; + char *p; - if (gotoblas) return; + if (gotoblas) return; - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } - if (gotoblas == NULL) - { - snprintf(coremsg, 128, "Falling back to POWER8 core\n"); - openblas_warning(1, coremsg); - gotoblas = &gotoblas_POWER8; - } + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to POWER8 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_POWER8; + } - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); - exit(1); - } + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } } void gotoblas_dynamic_quit(void) { - gotoblas = NULL; + gotoblas = NULL; } From e5dc376912dab278afdf677cb112008d36ead0fe Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 12:48:47 -0500 Subject: [PATCH 332/718] Remove duplicate defines. --- driver/others/dynamic_power.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0f5b06be5..7b0b4ea01 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ static int __builtin_cpu_supports(char* arg) } #endif +#define CPU_UNKNOWN 0 +#define CPU_POWER6 6 +#define CPU_POWER7 7 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + #if defined(C_PGI) || defined(__clang__) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). @@ -61,13 +68,6 @@ static int __builtin_cpu_supports(char* arg) * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time */ -#define CPU_UNKNOWN 0 -#define CPU_POWER5 5 -#define CPU_POWER6 6 -#define CPU_POWER8 8 -#define CPU_POWER9 9 -#define CPU_POWER10 10 - static struct { uint32_t pvr_mask; uint32_t pvr_value; @@ -191,13 +191,6 @@ static int __builtin_cpu_is(const char *cpu) { #ifdef _AIX #include -#define CPU_UNKNOWN 0 -#define CPU_POWER6 6 -#define CPU_POWER7 7 -#define CPU_POWER8 8 -#define CPU_POWER9 9 -#define CPU_POWER10 10 - static int cpuid(void) { int arch = _system_configuration.implementation; From b677d0d5fd175768e63d02253b12d1b0ccb2d242 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 2 Oct 2023 13:09:12 -0500 Subject: [PATCH 333/718] Adding missing endif --- driver/others/dynamic_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 7b0b4ea01..6ed26ad1e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -234,6 +234,7 @@ static int __builtin_cpu_supports(char* arg) return 0; } #endif +#endif static gotoblas_t *get_coretype(void) { From a8c90eb3ed5cae583bdc289846fe7d37fdc42d28 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 3 Oct 2023 10:24:04 -0500 Subject: [PATCH 334/718] Added cpu_is --- driver/others/dynamic_power.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 6ed26ad1e..252e409b3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -233,8 +233,27 @@ static int __builtin_cpu_supports(char* arg) } return 0; } + +static int __builtin_cpu_is(char *arg) +{ + static int ipinfo = -1; + if (ipinfo < 0) { + ipinfo = cpuid(); + } + if (ipinfo == CPU_POWER10) { + if (!strcmp(arg, "power10") return 1; + } else if (ipinfo == CPU_POWER9) { + if (!strcmp(arg, "power9") return 1; + } else if (ipinfo == CPU_POWER8) { + if (!strcmp(arg, "power8") return 1; + } else if (ipinfo == CPU_POWER6) { + if (!strcmp(arg, "power6") return 1; + } else { + return 0; + } +} #endif -#endif +#endif /* _AIX */ static gotoblas_t *get_coretype(void) { From 2d0b2334259d41c2003b51a07580dbd25cfe267c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 3 Oct 2023 10:26:14 -0500 Subject: [PATCH 335/718] Fix missing parens. --- driver/others/dynamic_power.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 252e409b3..734122178 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -241,13 +241,13 @@ static int __builtin_cpu_is(char *arg) ipinfo = cpuid(); } if (ipinfo == CPU_POWER10) { - if (!strcmp(arg, "power10") return 1; + if (!strcmp(arg, "power10")) return 1; } else if (ipinfo == CPU_POWER9) { - if (!strcmp(arg, "power9") return 1; + if (!strcmp(arg, "power9")) return 1; } else if (ipinfo == CPU_POWER8) { - if (!strcmp(arg, "power8") return 1; + if (!strcmp(arg, "power8")) return 1; } else if (ipinfo == CPU_POWER6) { - if (!strcmp(arg, "power6") return 1; + if (!strcmp(arg, "power6")) return 1; } else { return 0; } From 09212f84bff0ca8173f928c59ec81da3ab00933b Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 3 Oct 2023 12:23:21 -0500 Subject: [PATCH 336/718] Fix default case for cpu_is. --- driver/others/dynamic_power.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 734122178..252baaeeb 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -248,9 +248,8 @@ static int __builtin_cpu_is(char *arg) if (!strcmp(arg, "power8")) return 1; } else if (ipinfo == CPU_POWER6) { if (!strcmp(arg, "power6")) return 1; - } else { - return 0; } + return 0; } #endif #endif /* _AIX */ From 3cc72a3797ac050841975ff38d317f34ecfeb503 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 4 Oct 2023 09:54:37 -0500 Subject: [PATCH 337/718] Only include cpu_id and cpu_supports in AIX and fix parameter types. --- driver/others/dynamic_power.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 252baaeeb..c01d112bc 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -36,7 +36,7 @@ char *gotoblas_corename(void) { return corename[0]; } -#if defined(__clang__) +#if defined(__clang__) && !defined(_AIX) static int __builtin_cpu_supports(char* arg) { return 0; @@ -50,7 +50,7 @@ static int __builtin_cpu_supports(char* arg) #define CPU_POWER9 9 #define CPU_POWER10 10 -#if defined(C_PGI) || defined(__clang__) +#if defined(C_PGI) || (defined(__clang__) && !defined(_AIX)) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). * Fake a version here for use in the CPU detection code below. @@ -213,7 +213,7 @@ static int cpuid(void) } #ifndef __BUILTIN_CPU_SUPPORTS__ -static int __builtin_cpu_supports(char* arg) +static int __builtin_cpu_supports(const char* arg) { static int ipinfo = -1; if (ipinfo < 0) { @@ -234,7 +234,7 @@ static int __builtin_cpu_supports(char* arg) return 0; } -static int __builtin_cpu_is(char *arg) +static int __builtin_cpu_is(const char *arg) { static int ipinfo = -1; if (ipinfo < 0) { From db0805906ba0d7477ff2adad41bf815cd25e9d06 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 4 Oct 2023 14:04:03 -0500 Subject: [PATCH 338/718] powerpc: Fix build errors with Open XL C This patch fixes errors when using Open XL C compiler on AIX. Tested with gcc/xlf and ibm-clang/xlf compiler combinations. --- Makefile.power | 7 +++++++ c_check | 18 +++++++++++++----- kernel/Makefile | 6 ------ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/Makefile.power b/Makefile.power index 46afb2d4a..ada51b2e1 100644 --- a/Makefile.power +++ b/Makefile.power @@ -98,6 +98,9 @@ endif endif endif +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -fno-integrated-as +endif # workaround for C->FORTRAN ABI violation in LAPACKE ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -fno-optimize-sibling-calls @@ -133,7 +136,11 @@ ifdef BINARY64 ifeq ($(OSNAME), AIX) +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mpowerpc64 -maix64 +else +CCOMMON_OPT += -m64 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif diff --git a/c_check b/c_check index 4d12c1674..b018c10a8 100755 --- a/c_check +++ b/c_check @@ -96,11 +96,19 @@ esac defined=0 if [ "$os" = "AIX" ]; then - case "$BINARY" in - 32) compiler_name="$compiler_name -maix32" ;; - 64) compiler_name="$compiler_name -maix64" ;; - esac - defined=1 + if [ "$compiler" = "GCC" ]; then + case "$BINARY" in + 32) compiler_name="$compiler_name -maix32" ;; + 64) compiler_name="$compiler_name -maix64" ;; + esac + defined=1 + else + case "$BINARY" in + 32) compiler_name="$compiler_name -m32" ;; + 64) compiler_name="$compiler_name -m64" ;; + esac + defined=1 + fi fi case "$architecture" in diff --git a/kernel/Makefile b/kernel/Makefile index 1e0a0074f..3f9afd3fa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,6 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system -ifeq ($(ARCH), power) -ifeq ($(C_COMPILER), CLANG) - override CFLAGS += -fno-integrated-as -endif -endif - AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 From c60f9d9c084a97d1c416d63d921a8fcb30b090ac Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 09:49:17 -0500 Subject: [PATCH 339/718] Add missing CPU_POWER5. --- driver/others/dynamic_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index c01d112bc..8c5caada0 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -44,6 +44,7 @@ static int __builtin_cpu_supports(char* arg) #endif #define CPU_UNKNOWN 0 +#define CPU_POWER5 5 #define CPU_POWER6 6 #define CPU_POWER7 7 #define CPU_POWER8 8 From 71c6689af4e61cc4891eba3d996fb39920798e37 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 12:20:40 -0500 Subject: [PATCH 340/718] Fix dynamic dispatch to work for clang. --- driver/others/dynamic_power.c | 141 ++++++++++++++-------------------- 1 file changed, 59 insertions(+), 82 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 8c5caada0..c43738ef4 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -36,13 +36,6 @@ char *gotoblas_corename(void) { return corename[0]; } -#if defined(__clang__) && !defined(_AIX) -static int __builtin_cpu_supports(char* arg) -{ - return 0; -} -#endif - #define CPU_UNKNOWN 0 #define CPU_POWER5 5 #define CPU_POWER6 6 @@ -51,7 +44,31 @@ static int __builtin_cpu_supports(char* arg) #define CPU_POWER9 9 #define CPU_POWER10 10 -#if defined(C_PGI) || (defined(__clang__) && !defined(_AIX)) +#ifdef _AIX +#include + +static int cpuid(void) +{ + int arch = _system_configuration.implementation; +#ifdef POWER_6 + if (arch == POWER_6) return CPU_POWER6; +#endif +#ifdef POWER_7 + else if (arch == POWER_7) return CPU_POWER7; +#endif +#ifdef POWER_8 + else if (arch == POWER_8) return CPU_POWER8; +#endif +#ifdef POWER_9 + else if (arch == POWER_9) return CPU_POWER9; +#endif +#ifdef POWER_10 + else if (arch == POWER_10) return CPU_POWER10; +#endif + return CPU_UNKNOWN; +} +#else +#if defined(C_PGI) || defined(__clang__) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). * Fake a version here for use in the CPU detection code below. @@ -61,8 +78,6 @@ static int __builtin_cpu_supports(char* arg) * what was requested. */ -#include - /* * Define POWER processor version table. * @@ -161,79 +176,32 @@ static struct { }, }; -static int __builtin_cpu_is(const char *cpu) { - int i; - uint32_t pvr; - uint32_t cpu_type; +static int cpuid(void) +{ + int i; + uint32_t pvr; + uint32_t cpu_type; - asm("mfpvr %0" : "=r"(pvr)); + asm("mfpvr %0" : "=r"(pvr)); - for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { - if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { - break; - } - } + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } #if defined(DEBUG) - printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, - pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); #endif - cpu_type = pvrPOWER[i].cpu_type; - - if (!strcmp(cpu, "power8")) - return cpu_type == CPU_POWER8; - if (!strcmp(cpu, "power9")) - return cpu_type == CPU_POWER9; - return 0; + cpu_type = pvrPOWER[i].cpu_type; + return (int)(cpu_type); } - #endif /* C_PGI */ - -#ifdef _AIX -#include - -static int cpuid(void) -{ - int arch = _system_configuration.implementation; -#ifdef POWER_6 - if (arch == POWER_6) return CPU_POWER6; -#endif -#ifdef POWER_7 - else if (arch == POWER_7) return CPU_POWER7; -#endif -#ifdef POWER_8 - else if (arch == POWER_8) return CPU_POWER8; -#endif -#ifdef POWER_9 - else if (arch == POWER_9) return CPU_POWER9; -#endif -#ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; -#endif - return CPU_UNKNOWN; -} +#endif /* _AIX */ #ifndef __BUILTIN_CPU_SUPPORTS__ -static int __builtin_cpu_supports(const char* arg) -{ - static int ipinfo = -1; - if (ipinfo < 0) { - ipinfo = cpuid(); - } - if (ipinfo >= CPU_POWER10) { - if (!strcmp(arg, "power10")) return 1; - } - if (ipinfo >= CPU_POWER9) { - if (!strcmp(arg, "power9")) return 1; - } - if (ipinfo >= CPU_POWER8) { - if (!strcmp(arg, "power8")) return 1; - } - if (ipinfo >= CPU_POWER6) { - if (!strcmp(arg, "power6")) return 1; - } - return 0; -} +#include static int __builtin_cpu_is(const char *arg) { @@ -241,19 +209,28 @@ static int __builtin_cpu_is(const char *arg) if (ipinfo < 0) { ipinfo = cpuid(); } +#ifdef HAVE_P10_SUPPORT if (ipinfo == CPU_POWER10) { if (!strcmp(arg, "power10")) return 1; - } else if (ipinfo == CPU_POWER9) { + } +#endif + if (ipinfo == CPU_POWER9) { if (!strcmp(arg, "power9")) return 1; } else if (ipinfo == CPU_POWER8) { if (!strcmp(arg, "power8")) return 1; +#ifndef C_PGI } else if (ipinfo == CPU_POWER6) { if (!strcmp(arg, "power6")) return 1; +#endif } return 0; } + +static int __builtin_cpu_supports(const char *arg) +{ + return 0; +} #endif -#endif /* _AIX */ static gotoblas_t *get_coretype(void) { @@ -268,18 +245,18 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT -#ifdef _AIX - if (__builtin_cpu_supports("power10")) +#if defined(_AIX) || defined(__clang__) + if (__builtin_cpu_is("power10")) #else if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) #endif return &gotoblas_POWER10; #endif - /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) - if (__builtin_cpu_is("power10")) - return &gotoblas_POWER9; -#endif + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif return NULL; } From 298bf1f240afcac73d306f4c2da35b314c39dba6 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 12:50:28 -0500 Subject: [PATCH 341/718] Reduce differences. --- driver/others/dynamic_power.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index c43738ef4..4e8710bc7 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -178,23 +178,23 @@ static struct { static int cpuid(void) { - int i; - uint32_t pvr; - uint32_t cpu_type; + int i; + uint32_t pvr; + uint32_t cpu_type; - asm("mfpvr %0" : "=r"(pvr)); + asm("mfpvr %0" : "=r"(pvr)); - for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { - if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { - break; - } - } + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } #if defined(DEBUG) - printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, - pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); #endif - cpu_type = pvrPOWER[i].cpu_type; + cpu_type = pvrPOWER[i].cpu_type; return (int)(cpu_type); } #endif /* C_PGI */ @@ -252,10 +252,10 @@ static gotoblas_t *get_coretype(void) { #endif return &gotoblas_POWER10; #endif - /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) - if (__builtin_cpu_is("power10")) - return &gotoblas_POWER9; + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; #endif return NULL; } From 36e08f69946321a7ca3f9ef495d198802e1b5b17 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 13:08:41 -0500 Subject: [PATCH 342/718] One more small change. --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 4e8710bc7..311987d31 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -195,7 +195,7 @@ static int cpuid(void) pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); #endif cpu_type = pvrPOWER[i].cpu_type; - return (int)(cpu_type); + return (int)(cpu_type); } #endif /* C_PGI */ #endif /* _AIX */ From 3655632611173f191c22a36d7c9e0950cdcc202e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 13:11:40 -0500 Subject: [PATCH 343/718] Another small change. --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 311987d31..f98fedd45 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -246,7 +246,7 @@ static gotoblas_t *get_coretype(void) { #endif #ifdef HAVE_P10_SUPPORT #if defined(_AIX) || defined(__clang__) - if (__builtin_cpu_is("power10")) + if (__builtin_cpu_is("power10")) #else if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) #endif From 880af052dde230595328d8a19d10e42f39369a43 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 6 Oct 2023 13:41:49 -0500 Subject: [PATCH 344/718] Fix dynamic dispatch P9 for clang. --- driver/others/dynamic_power.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index f98fedd45..db04e635f 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,7 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) extern gotoblas_t gotoblas_POWER9; #endif #ifdef HAVE_P10_SUPPORT @@ -27,7 +27,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER6) return corename[1]; #endif if (gotoblas == &gotoblas_POWER8) return corename[2]; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif #ifdef HAVE_P10_SUPPORT @@ -240,7 +240,7 @@ static gotoblas_t *get_coretype(void) { #endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif @@ -281,7 +281,7 @@ static gotoblas_t *force_coretype(char * coretype) { case 1: return (&gotoblas_POWER6); #endif case 2: return (&gotoblas_POWER8); -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__) case 3: return (&gotoblas_POWER9); #endif #ifdef HAVE_P10_SUPPORT From b626544ca32396cbd6bd138d75669bee05330877 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Oct 2023 22:31:03 +0200 Subject: [PATCH 345/718] complete function prototypes and remove unused functions --- ctest/c_cblat1c.c | 303 +++-------------------------- ctest/c_cblat2c.c | 366 +++++++---------------------------- ctest/c_cblat3c.c | 124 ------------ ctest/c_dblat1c.c | 192 +++--------------- ctest/c_dblat2c.c | 352 +++++++-------------------------- ctest/c_dblat3c.c | 435 ++++++----------------------------------- ctest/c_sblat1c.c | 318 +++--------------------------- ctest/c_sblat2c.c | 481 +++++++--------------------------------------- ctest/c_sblat3c.c | 437 ++++++----------------------------------- ctest/c_zblat1c.c | 304 +++-------------------------- ctest/c_zblat2c.c | 367 +++++++---------------------------- ctest/c_zblat3c.c | 463 +++++++------------------------------------- 12 files changed, 549 insertions(+), 3593 deletions(-) diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 8c0dd140c..2f84da43b 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -242,251 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -396,7 +273,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -414,17 +291,21 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int cchk1_(), cchk2_(), cchk3_(), cchk4_(), - cchk5_(), cchk6_(); + extern /* Subroutine */ int cchk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen); + extern /* Subroutine */ int cchk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen); + extern /* Subroutine */ int cchk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); + extern /* Subroutine */ int cchk6_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen); static complex a[4225] /* was [65][65] */; static real g[65]; static integer i__, j, n; static logical fatal; static complex x[65], y[65], z__[130]; - extern doublereal sdiff_(); + extern doublereal sdiff_(real*, real*); static logical trace; static integer nidim; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static char snaps[32], trans[1]; static integer isnum; static logical ltest[17]; @@ -438,11 +319,11 @@ static logical c_false = FALSE_; static char snamet[12]; static real thresh; static logical rorder; - extern /* Subroutine */ int cc2chke_(); + extern /* Subroutine */ void cc2chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static complex alf[7]; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static integer inc[7], nkb; static complex bet[7]; static real eps, err; @@ -983,22 +864,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int cchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -complex *alf; -integer *nbet; -complex *bet; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1015,10 +881,10 @@ ftnlen sname_len; static integer incx, incy; static logical full, tran, null; static integer i__, m, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; @@ -1026,14 +892,15 @@ ftnlen sname_len; static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int ccgbmv_(), ccgemv_(); - extern logical lceres_(); + extern /* Subroutine */ int ccgbmv_(integer*, char*, integer*, integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern /* Subroutine */ void ccgemv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static char ctrans[14]; static real errmax; static complex transl; static char transs[1]; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als, bls; static real err; static integer iku, kls, kus; @@ -1448,22 +1315,7 @@ L140: } /* cchk1_ */ -/* Subroutine */ int cchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -complex *alf; -integer *nbet; -complex *bet; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1481,10 +1333,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, k, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1495,13 +1347,14 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern /* Subroutine */ int cchbmv_(), cchemv_(); - extern logical lceres_(); - extern /* Subroutine */ int cchpmv_(); + extern /* Subroutine */ void cchbmv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern /* Subroutine */ void cchemv_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cchpmv_(integer*, char*, integer*, complex*, complex*, complex*, integer*, complex*, complex*, integer*, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als, bls; static real err; @@ -1906,19 +1759,7 @@ L130: } /* cchk2_ */ -/* Subroutine */ int cchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *xt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* xt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1937,10 +1778,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1], cdiag[14]; static integer i__, k, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1950,17 +1791,19 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, ns, lx; - extern logical lceres_(); - extern /* Subroutine */ int cctbmv_(), cctbsv_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cctbmv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cctbsv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); static char ctrans[14]; - extern /* Subroutine */ int cctpmv_(); + extern /* Subroutine */ void cctpmv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen); static real errmax; - extern /* Subroutine */ int cctrmv_(), cctpsv_(); + extern /* Subroutine */ void cctrmv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cctpsv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen); static complex transl; - extern /* Subroutine */ int cctrsv_(); + extern /* Subroutine */ void cctrsv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, icd, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static integer ict, icu; static real err; @@ -2418,21 +2261,7 @@ L130: } /* cchk3_ */ -/* Subroutine */ int cchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; @@ -2444,21 +2273,21 @@ ftnlen sname_len; static integer incx, incy; static logical null; static integer i__, j, m, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys, ia, nc, nd, im, in; - extern /* Subroutine */ int ccgerc_(); + extern /* Subroutine */ void ccgerc_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*); static integer ms, ix, iy, ns, lx, ly; - extern /* Subroutine */ int ccgeru_(); - extern logical lceres_(); + extern /* Subroutine */ void ccgeru_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als; static real err; @@ -2786,21 +2615,7 @@ L150: } /* cchk4_ */ -/* Subroutine */ int cchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2818,10 +2633,12 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int cmake_(), ccher_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void ccher_(integer*, char*, integer*, real*, complex*, integer*, complex*, integer*, ftnlen); static complex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int cchpr_(), cmvch_(); + extern /* Subroutine */ void cchpr_(integer*, char*, integer*, real*, complex*, integer*, complex*, ftnlen); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -2832,11 +2649,11 @@ ftnlen sname_len; static logical packed; static integer ix, ns, lx; static real ralpha; - extern logical lceres_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static real err; /* Tests CHER and CHPR. */ @@ -3160,21 +2977,7 @@ L130: } /* cchk5_ */ -/* Subroutine */ int cchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -complex *alf; -integer *ninc, *inc, *nmax, *incmax; -complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -real *g; -complex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int cchk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3192,25 +2995,26 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int cmake_(); + extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen); static complex alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int cmvch_(); + extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; static logical upper; static char uplos[1]; - extern /* Subroutine */ int ccher2_(), cchpr2_(); + extern /* Subroutine */ void ccher2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*, ftnlen); + extern /* Subroutine */ void cchpr2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; - extern logical lceres_(); + extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen); static real errmax; static complex transl; static integer laa, lda; - extern logical lce_(); + extern logical lce_(complex*, complex*, integer*); static complex als; static real err; @@ -3597,24 +3401,7 @@ L170: } /* cchk6_ */ -/* Subroutine */ int cmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -complex *alpha, *a; -integer *nmax; -complex *x; -integer *incx; -complex *beta, *y; -integer *incy; -complex *yt; -real *g; -complex *yy; -real *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int cmvch_(char* trans, integer* m, integer* n, complex* alpha, complex* a, integer* nmax, complex* x, integer* incx, complex* beta, complex* y, integer* incy, complex* yt, real* g, complex* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ @@ -3812,9 +3599,7 @@ L80: } /* cmvch_ */ -logical lce_(ri, rj, lr) -complex *ri, *rj; -integer *lr; +logical lce_(complex* ri, complex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -3861,13 +3646,7 @@ L30: } /* lce_ */ -logical lceres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -complex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lceres_(char* type__, char* uplo, integer* m, integer* n, complex* aa, complex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -3960,9 +3739,7 @@ L80: } /* lceres_ */ -/* Complex */ VOID cbeg_( ret_val, reset) -complex * ret_val; -logical *reset; +/* Complex */ VOID cbeg_(complex* ret_val, logical* reset) { /* System generated locals */ real r__1, r__2; @@ -4023,8 +3800,7 @@ L10: } /* cbeg_ */ -doublereal sdiff_(x, y) -real *x, *y; +doublereal sdiff_(real* x, real* y) { /* System generated locals */ real ret_val; @@ -4044,19 +3820,7 @@ real *x, *y; } /* sdiff_ */ -/* Subroutine */ int cmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -complex *a; -integer *nmax; -complex *aa; -integer *lda, *kl, *ku; -logical *reset; -complex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int cmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, complex* a, integer* nmax, complex* aa, integer* lda, integer* kl, integer* ku, logical* reset, complex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4064,7 +3828,7 @@ ftnlen diag_len; complex q__1, q__2; /* Local variables */ - extern /* Complex */ VOID cbeg_(); + extern /* Complex */ VOID cbeg_(complex*, logical*); static integer ibeg, iend, ioff; static logical unit; static integer i__, j; diff --git a/ctest/c_cblat3c.c b/ctest/c_cblat3c.c index 1f4b967b0..5ad9b8bd8 100644 --- a/ctest/c_cblat3c.c +++ b/ctest/c_cblat3c.c @@ -242,130 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - - /* Common Block Declarations */ diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c index bf2f7a781..f0141f2a5 100644 --- a/ctest/c_dblat1c.c +++ b/ctest/c_dblat1c.c @@ -21,19 +21,6 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; -#ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} -static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} -static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} -#else -static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} -static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} -#endif -#define pCf(z) (*_pCf(z)) -#define pCd(z) (*_pCd(z)) typedef int logical; typedef short int shortlogical; typedef char logical1; @@ -242,124 +229,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif /* Common Block Declarations */ @@ -375,16 +244,16 @@ struct { static integer c__1 = 1; static doublereal c_b34 = 1.; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ static doublereal sfac = 9.765625e-4; /* Local variables */ - extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_(); + extern /* Subroutine */ int check0_(doublereal*), check1_(doublereal*), check2_(doublereal*), check3_(doublereal*); static integer ic; - extern /* Subroutine */ int header_(); + extern /* Subroutine */ int header_(void); /* Test program for the DOUBLE PRECISION Level 1 CBLAS. */ /* Based upon the original CBLAS test routine together with: */ @@ -431,7 +300,7 @@ static doublereal c_b34 = 1.; } /* MAIN__ */ -/* Subroutine */ int header_() +/* Subroutine */ int header_(void) { /* Initialized data */ @@ -450,8 +319,7 @@ static doublereal c_b34 = 1.; } /* header_ */ -/* Subroutine */ int check0_(sfac) -doublereal *sfac; +/* Subroutine */ int check0_(doublereal* sfac) { /* Initialized data */ @@ -464,7 +332,7 @@ doublereal *sfac; /* Local variables */ static integer k; - extern /* Subroutine */ int drotgtest_(), stest1_(); + extern /* Subroutine */ int drotgtest_(doublereal*,doublereal*,doublereal*,doublereal*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sa, sb, sc, ss; /* .. Parameters .. */ @@ -509,8 +377,7 @@ L40: return 0; } /* check0_ */ -/* Subroutine */ int check1_(sfac) -doublereal *sfac; +/* Subroutine */ int check1_(doublereal* sfac) { /* Initialized data */ @@ -535,14 +402,14 @@ doublereal *sfac; /* Local variables */ static integer i__; - extern doublereal dnrm2test_(); + extern doublereal dnrm2test_(int*, doublereal*, int*); static doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(), dscaltest_(); - extern doublereal dasumtest_(); - extern /* Subroutine */ int itest1_(), stest1_(); + extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*); + extern doublereal dasumtest_(int*,doublereal*,int*); + extern /* Subroutine */ int itest1_(int*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sx[8]; static integer np1; - extern integer idamaxtest_(); + extern integer idamaxtest_(int*,doublereal*,int*); static integer len; /* .. Parameters .. */ @@ -603,8 +470,7 @@ doublereal *sfac; return 0; } /* check1_ */ -/* Subroutine */ int check2_(sfac) -doublereal *sfac; +/* Subroutine */ int check2_(doublereal* sfac) { /* Initialized data */ @@ -649,10 +515,10 @@ doublereal *sfac; /* Local variables */ static integer lenx, leny; - extern doublereal ddottest_(); + extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*); static integer i__, j, ksize; - extern /* Subroutine */ int stest_(), dcopytest_(), dswaptest_(), - daxpytest_(), stest1_(); + extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), + daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static integer ki, kn, mx, my; static doublereal sx[7], sy[7], stx[7], sty[7]; @@ -733,8 +599,7 @@ doublereal *sfac; return 0; } /* check2_ */ -/* Subroutine */ int check3_(sfac) -doublereal *sfac; +/* Subroutine */ int check3_(doublereal* sfac) { /* Initialized data */ @@ -753,9 +618,9 @@ doublereal *sfac; ; /* Local variables */ - extern /* Subroutine */ int drottest_(); + extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*); static integer i__, k, ksize; - extern /* Subroutine */int stest_(), drotmtest_(); + extern /* Subroutine */int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*); static integer ki, kn; static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10]; @@ -826,9 +691,7 @@ doublereal *sfac; return 0; } /* check3_ */ -/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac) -integer *len; -doublereal *scomp, *strue, *ssize, *sfac; +/* Subroutine */ int stest_(int* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac) { /* System generated locals */ integer i__1; @@ -836,7 +699,7 @@ doublereal *scomp, *strue, *ssize, *sfac; /* Local variables */ static integer i__; - extern doublereal sdiff_(); + extern doublereal sdiff_(doublereal*,doublereal*); static doublereal sd; /* ********************************* STEST ************************** */ @@ -892,11 +755,10 @@ L40: } /* stest_ */ -/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac) -doublereal *scomp1, *strue1, *ssize, *sfac; +/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac) { static doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(); + extern /* Subroutine */ int stest_(int*, doublereal*, doublereal*, doublereal*, doublereal*); /* ************************* STEST1 ***************************** */ @@ -923,8 +785,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac; return 0; } /* stest1_ */ -doublereal sdiff_(sa, sb) -doublereal *sa, *sb; +doublereal sdiff_(doublereal* sa, doublereal* sb) { /* System generated locals */ doublereal ret_val; @@ -938,8 +799,7 @@ doublereal *sa, *sb; return ret_val; } /* sdiff_ */ -/* Subroutine */ int itest1_(icomp, itrue) -integer *icomp, *itrue; +/* Subroutine */ int itest1_(int* icomp, int* itrue) { /* Local variables */ static integer id; @@ -1188,4 +1048,4 @@ doublereal *dparam; return 0; } /* drotm_ */ -#endif \ No newline at end of file +#endif diff --git a/ctest/c_dblat2c.c b/ctest/c_dblat2c.c index f94dbc1fe..547aa808e 100644 --- a/ctest/c_dblat2c.c +++ b/ctest/c_dblat2c.c @@ -242,129 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -395,7 +272,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -413,17 +290,21 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), - dchk5_(), dchk6_(); + extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); static doublereal a[4225] /* was [65][65] */, g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublereal x[65], y[65], z__[130]; static logical trace; static integer nidim; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char snaps[32], trans[1]; static integer isnum; static logical ltest[16]; @@ -437,11 +318,11 @@ static logical c_false = FALSE_; static char snamet[12]; static doublereal thresh; static logical rorder; - extern /* Subroutine */ int cd2chke_(); + extern /* Subroutine */ void cd2chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static doublereal alf[7]; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer inc[7], nkb; static doublereal bet[7],eps,err; char tmpchar; @@ -977,21 +858,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1007,10 +874,10 @@ ftnlen sname_len; static integer incx, incy; static logical full, tran, null; static integer i__, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; @@ -1018,13 +885,14 @@ ftnlen sname_len; static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int cdgbmv_(), cdgemv_(); - extern logical lderes_(); + extern /* Subroutine */ void cdgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ void cdgemv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char ctrans[14]; static doublereal errmax, transl; static char transs[1]; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; static integer iku, kls, kus; @@ -1429,21 +1297,7 @@ L140: } /* dchk1_ */ -/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1460,10 +1314,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1474,12 +1328,13 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern logical lderes_(); - extern /* Subroutine */ int cdsbmv_(), cdspmv_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdsbmv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ void cdspmv_(integer*, char*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); static doublereal errmax, transl; - extern /* Subroutine */ int cdsymv_(); + extern /* Subroutine */ void cdsymv_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen); static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; @@ -1882,17 +1737,7 @@ L130: } /* dchk2_ */ -/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* xt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1911,10 +1756,10 @@ ftnlen sname_len; static logical full, null; static char uplo[1], cdiag[14]; static integer i__, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; @@ -1924,16 +1769,19 @@ ftnlen sname_len; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, ns, lx; - extern logical lderes_(); - extern /* Subroutine */ int cdtbmv_(), cdtbsv_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdtbmv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtbsv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static char ctrans[14]; static doublereal errmax; - extern /* Subroutine */ int cdtpmv_(), cdtrmv_(); + extern /* Subroutine */ void cdtpmv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrmv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static doublereal transl; - extern /* Subroutine */ int cdtpsv_(), cdtrsv_(); + extern /* Subroutine */ void cdtpsv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrsv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, icd, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ict, icu; static doublereal err; @@ -2388,19 +2236,7 @@ L130: } /* dchk3_ */ -/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; @@ -2411,17 +2247,18 @@ ftnlen sname_len; static integer incx, incy; static logical null; static integer i__, j, m, n; - extern /* Subroutine */ int dmake_(), cdger_(); + extern /* Subroutine */ void cdger_(integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; @@ -2727,19 +2564,7 @@ L150: } /* dchk4_ */ -/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2757,25 +2582,25 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; - extern /* Subroutine */ int cdspr_(); + extern /* Subroutine */ void cdspr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, ftnlen); static logical reset; static char cuplo[14]; static integer incxs; - extern /* Subroutine */ int cdsyr_(); + extern /* Subroutine */ void cdsyr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen); static logical upper; static char uplos[1]; static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, ns, lx; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; @@ -3096,19 +2921,7 @@ L130: } /* dchk5_ */ -/* Subroutine */ int dchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *ninc, *inc, *nmax, *incmax; -doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3125,24 +2938,25 @@ ftnlen sname_len; static logical full, null; static char uplo[1]; static integer i__, j, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int dmvch_(); + extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; static logical upper; static char uplos[1]; - extern /* Subroutine */ int cdspr2_(), cdsyr2_(); + extern /* Subroutine */ void cdspr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, ftnlen); + extern /* Subroutine */ void cdsyr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax, transl; static integer laa, lda; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, err; /* Tests DSYR2 and DSPR2. */ @@ -3508,25 +3322,13 @@ L170: } /* dchk6_ */ -/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublereal *a; -integer *nmax; -doublereal *aa; -integer *lda, *kl, *ku; -logical *reset; -doublereal *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern doublereal dbeg_(); + extern doublereal dbeg_(logical* ); static integer ibeg, iend, ioff; static logical unit; static integer i__, j; @@ -3752,28 +3554,14 @@ ftnlen diag_len; } /* dmake_ */ -/* Subroutine */ int dmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -doublereal *alpha, *a; -integer *nmax; -doublereal *x; -integer *incx; -doublereal *beta, *y; -integer *incy; -doublereal *yt, *g, *yy, *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int dmvch_(char* trans, integer* m, integer* n, doublereal* alpha, doublereal* a, integer* nmax, doublereal* x, integer* incx, doublereal* beta, doublereal* y, integer* incy, doublereal* yt, doublereal* g, doublereal* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; doublereal d__1; /* Builtin functions */ - double sqrt(); + double sqrt(double); /* Local variables */ static doublereal erri; @@ -3902,9 +3690,7 @@ L70: } /* dmvch_ */ -logical lde_(ri, rj, lr) -doublereal *ri, *rj; -integer *lr; +logical lde_(doublereal* ri, doublereal* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3949,13 +3735,7 @@ L30: } /* lde_ */ -logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublereal *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -4042,8 +3822,7 @@ L80: } /* lderes_ */ -doublereal dbeg_(reset) -logical *reset; +doublereal dbeg_(logical* reset) { /* System generated locals */ doublereal ret_val; @@ -4094,8 +3873,7 @@ L10: } /* dbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; diff --git a/ctest/c_dblat3c.c b/ctest/c_dblat3c.c index 05d6b65b0..dc3d6f9e7 100644 --- a/ctest/c_dblat3c.c +++ b/ctest/c_dblat3c.c @@ -242,129 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -393,7 +270,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -403,25 +280,24 @@ static logical c_false = FALSE_; integer i__1, i__2, i__3; doublereal d__1; - /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); - integer f_clos(); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), - dchk5_(); + extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen); +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len); static doublereal c__[4225] /* was [65][65] */, g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublereal w[130]; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical trace; static integer nidim; static char snaps[32]; @@ -433,11 +309,11 @@ static logical c_false = FALSE_; static char snamet[12], transa[1], transb[1]; static doublereal thresh; static logical rorder; - extern /* Subroutine */ int cd3chke_(); + extern /* Subroutine */ void cd3chke_(char*, ftnlen); static integer layout; static logical ltestt, tsterr; static doublereal alf[7]; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal bet[7], eps, err; char tmpchar; @@ -907,21 +783,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -931,29 +793,27 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; static integer ldas, ldbs, ldcs; static logical same, null; static integer i__, k, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13], trana, tranb; static integer nargs; static logical reset; - extern /* Subroutine */ void dprcn1_(); + extern /* Subroutine */ void dprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, ma, mb, na, nb, nc, ik, im, in; - extern /* Subroutine */ int cdgemm_(); + extern /* Subroutine */ void cdgemm_(integer*, char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static integer ks, ms, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static doublereal errmax; static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als, bls, err; /* Tests DGEMM. */ @@ -1283,23 +1143,8 @@ L130: } /* dchk1_ */ -/* Subroutine */ void dprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ void dprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1328,21 +1173,7 @@ ftnlen transb_len; } /* dprcn1_ */ -/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1353,8 +1184,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; @@ -1364,21 +1193,21 @@ ftnlen sname_len; static logical left, null; static char uplo[1]; static integer i__, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static char sides[1]; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void dprcn2_(); + extern /* Subroutine */ void dprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, na, nc, im, in, ms, ns; - extern logical lderes_(); - extern /* Subroutine */ int cdsymm_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdsymm_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; static integer laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ics; static doublereal als, bls; static integer icu; @@ -1692,23 +1521,8 @@ L120: } /* dchk2_ */ -/* Subroutine */ void dprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ void dprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1733,19 +1547,7 @@ ftnlen uplo_len; } /* dprcn2_ */ -/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* ct, doublereal* g, doublereal* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1766,24 +1568,24 @@ ftnlen sname_len; static logical left, null; static char uplo[1]; static integer i__, j, m, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; static char diags[1]; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static char sides[1]; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void dprcn3_(); + extern /* Subroutine */ void dprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); static integer ia, na, nc, im, in, ms, ns; - extern logical lderes_(); - extern /* Subroutine */ int cdtrmm_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cdtrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static char tranas[1], transa[1]; - extern /* Subroutine */ int cdtrsm_(); + extern /* Subroutine */ void cdtrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static doublereal errmax; static integer laa, icd, lbb, lda, ldb; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static integer ics; static doublereal als; static integer ict, icu; @@ -2165,24 +1967,8 @@ L160: } /* dchk3_ */ -/* Subroutine */ void dprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -doublereal *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ void dprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cd[14], cs[14], cu[14], crc[14]; @@ -2219,21 +2005,7 @@ ftnlen diag_len; } /* dprcn3_ */ -/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2244,8 +2016,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static doublereal beta; @@ -2255,23 +2025,23 @@ ftnlen sname_len; static logical tran, null; static char uplo[1]; static integer i__, j, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static integer nargs; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void dprcn4_(); + extern /* Subroutine */ void dprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern /* Subroutine */ int cdsyrk_(); + extern /* Subroutine */ void cdsyrk_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static char transs[1]; static integer laa, lda, lcc, ldc; - extern logical lde_(); + extern logical lde_(doublereal*, doublereal*, integer*); static doublereal als; static integer ict, icu; static doublereal err; @@ -2586,23 +2356,8 @@ L130: } /* dchk4_ */ -/* Subroutine */ void dprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void dprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2629,21 +2384,7 @@ ftnlen transa_len; } /* dprcn4_ */ -/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublereal *alf; -integer *nbet; -doublereal *bet; -integer *nmax; -doublereal *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2653,8 +2394,6 @@ ftnlen sname_len; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static integer jjab; @@ -2665,23 +2404,23 @@ ftnlen sname_len; static logical tran, null; static char uplo[1]; static integer i__, j, k, n; - extern /* Subroutine */ int dmake_(); + extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen); static doublereal alpha; - extern /* Subroutine */ int dmmch_(); + extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical isame[13]; static integer nargs; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void dprcn5_(); + extern /* Subroutine */ void dprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; - extern logical lderes_(); + extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal errmax; static char transs[1]; static integer laa, lbb, lda, lcc, ldb, ldc; - extern logical lde_(); - extern /* Subroutine */ int cdsyr2k_(); + extern logical lde_(doublereal*, doublereal*, integer*); + extern /* Subroutine */ void cdsyr2k_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen); static doublereal als; static integer ict, icu; static doublereal err; @@ -3048,23 +2787,8 @@ L160: } /* dchk5_ */ -/* Subroutine */ void dprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void dprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3091,25 +2815,13 @@ ftnlen transa_len; } /* dprcn5_ */ -/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublereal *a; -integer *nmax; -doublereal *aa; -integer *lda; -logical *reset; -doublereal *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern doublereal dbeg_(); + extern doublereal dbeg_(logical*); static integer ibeg, iend; static logical unit; static integer i__, j; @@ -3241,25 +2953,7 @@ ftnlen diag_len; } /* dmake_ */ -/* Subroutine */ int dmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -doublereal *alpha, *a; -integer *lda; -doublereal *b; -integer *ldb; -doublereal *beta, *c__; -integer *ldc; -doublereal *ct, *g, *cc; -integer *ldcc; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int dmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublereal* alpha, doublereal* a, integer* lda, doublereal* b, integer* ldb, doublereal* beta, doublereal* c__, integer* ldc, doublereal* ct, doublereal* g, doublereal* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -3267,8 +2961,7 @@ ftnlen transb_len; doublereal d__1, d__2; /* Builtin functions */ - double sqrt(); - integer s_wsfe(), e_wsfe(), do_fio(); + double sqrt(double); /* Local variables */ static doublereal erri; @@ -3432,9 +3125,7 @@ L150: } /* dmmch_ */ -logical lde_(ri, rj, lr) -doublereal *ri, *rj; -integer *lr; +logical lde_(doublereal* ri, doublereal* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3481,13 +3172,7 @@ L30: } /* lde_ */ -logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublereal *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -3576,8 +3261,7 @@ L80: } /* lderes_ */ -doublereal dbeg_(reset) -logical *reset; +doublereal dbeg_(logical* reset) { /* System generated locals */ doublereal ret_val; @@ -3629,8 +3313,7 @@ L10: } /* dbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 57e4707a9..1424e39b4 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -21,19 +21,6 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; -#ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} -static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} -static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} -#else -static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} -static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} -#endif -#define pCf(z) (*_pCf(z)) -#define pCd(z) (*_pCd(z)) typedef int logical; typedef short int shortlogical; typedef char logical1; @@ -242,250 +229,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -393,7 +270,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -402,26 +279,25 @@ static logical c_false = FALSE_; /* System generated locals */ integer i__1, i__2, i__3; real r__1; - /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); - integer f_clos(); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), - schk5_(); + extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); + extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen); static real c__[4225] /* was [65][65] */, g[65]; static integer i__, j, n; static logical fatal; static real w[130]; - extern doublereal sdiff_(); + extern doublereal sdiff_(real*, real*); static logical trace; static integer nidim; - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static char snaps[32]; static integer isnum; static logical ltest[6]; @@ -433,9 +309,9 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cs3chke_(); + extern /* Subroutine */ void cs3chke_(char*, ftnlen); static real alf[7], bet[7]; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real eps, err; char tmpchar; @@ -899,21 +775,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -923,8 +785,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -936,18 +796,17 @@ ftnlen sname_len; static logical trana, tranb; static integer nargs; static logical reset; - extern /* Subroutine */ void sprcn1_(); - extern /* Subroutine */ int smake_(); - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ void sprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - extern /* Subroutine */ int csgemm_(); + extern /* Subroutine */ void csgemm_(integer*, char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static real errmax; - extern logical lseres_(); - extern logical lse_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern logical lse_(real*, real*, integer*); static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; static real als, bls; - extern logical lse_(); static real err; /* Tests SGEMM. */ @@ -1278,23 +1137,8 @@ L130: -/* Subroutine */ void sprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ void sprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1324,21 +1168,7 @@ ftnlen transb_len; } /* sprcn1_ */ -/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1349,8 +1179,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -1368,15 +1196,15 @@ ftnlen sname_len; static char uplos[1]; static integer ia, ib, na, nc, im, in, ms, ns; static real errmax; - extern logical lseres_(); - extern /* Subroutine */ int cssymm_(); - extern void sprcn2_(); - extern int smake_(); - extern int smmch_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cssymm_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern void sprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc, ics; static real als, bls; static integer icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests SSYMM. */ @@ -1685,23 +1513,8 @@ L120: } /* schk2_ */ -/* Subroutine */ void sprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ void sprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1726,19 +1539,7 @@ ftnlen uplo_len; } /* sprcn2_ */ -/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* ct, real* g, real* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1751,8 +1552,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static char diag[1]; @@ -1769,18 +1568,19 @@ ftnlen sname_len; static integer nargs; static logical reset; static char uplos[1]; - extern /* Subroutine */ void sprcn3_(); + extern /* Subroutine */ void sprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, real*, integer*, integer*, ftnlen , ftnlen, ftnlen, ftnlen, ftnlen); static integer ia, na, nc, im, in, ms, ns; static char tranas[1], transa[1]; static real errmax; - extern int smake_(); - extern int smmch_(); - extern logical lseres_(); - extern /* Subroutine */ int cstrmm_(), cstrsm_(); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cstrmm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cstrsm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static integer laa, icd, lbb, lda, ldb, ics; static real als; static integer ict, icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests STRMM and STRSM. */ @@ -2155,24 +1955,8 @@ L160: } /* schk3_ */ -/* Subroutine */ void sprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -real *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ void sprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cd[14], cs[14], cu[14], crc[14]; @@ -2210,21 +1994,7 @@ ftnlen diag_len; } /* sprcn3_ */ -/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2235,8 +2005,6 @@ ftnlen sname_len; integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static real beta; @@ -2253,18 +2021,18 @@ ftnlen sname_len; static char trans[1]; static logical upper; static char uplos[1]; - extern /* Subroutine */ void sprcn4_(); - extern /* Subroutine */ int smake_(); - extern /* Subroutine */ int smmch_(); + extern /* Subroutine */ void sprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; static real errmax; - extern logical lseres_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); static char transs[1]; - extern /* Subroutine */ int cssyrk_(); + extern /* Subroutine */ void cssyrk_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); static integer laa, lda, lcc, ldc; static real als; static integer ict, icu; - extern logical lse_(); + extern logical lse_(real*, real*, integer*); static real err; /* Tests SSYRK. */ @@ -2575,23 +2343,8 @@ L130: } /* schk4_ */ -/* Subroutine */ void sprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -real *alpha; -integer *lda; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void sprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2619,21 +2372,7 @@ ftnlen transa_len; } /* sprcn4_ */ -/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -real *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -real *alf; -integer *nbet; -real *bet; -integer *nmax; -real *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* ab, real* aa, real* as, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, real* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2643,8 +2382,6 @@ ftnlen sname_len; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - /* Builtin functions */ - integer f_rew(), s_wsfe(), e_wsfe(), do_fio(); /* Local variables */ static integer jjab; @@ -2663,18 +2400,18 @@ ftnlen sname_len; static logical upper; static char uplos[1]; static integer ia, ib; - extern /* Subroutine */ void sprcn5_(); + extern /* Subroutine */ void sprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen); static integer jc, ma, na, nc, ik, in, jj, lj, ks, ns; static real errmax; - extern logical lseres_(); - extern int smake_(); + extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen); static char transs[1]; static integer laa, lbb, lda, lcc, ldb, ldc; static real als; static integer ict, icu; - extern /* Subroutine */ int cssyr2k_(); - extern logical lse_(); - extern int smmch_(); + extern /* Subroutine */ void cssyr2k_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen); + extern logical lse_(real*, real*, integer*); + extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen); static real err; /* Tests SSYR2K. */ @@ -3037,23 +2774,8 @@ L160: } /* schk5_ */ -/* Subroutine */ void sprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -real *alpha; -integer *lda, *ldb; -real *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ void sprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Builtin functions */ - integer s_wsfe(), do_fio(), e_wsfe(); /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3081,19 +2803,7 @@ ftnlen transa_len; } /* sprcn5_ */ -/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -real *a; -integer *nmax; -real *aa; -integer *lda; -logical *reset; -real *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2; @@ -3102,7 +2812,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend; - extern doublereal sbeg_(); + extern doublereal sbeg_(logical*); static logical unit; static integer i__, j; static logical lower, upper, gen, tri, sym; @@ -3233,25 +2943,7 @@ ftnlen diag_len; } /* smake_ */ -/* Subroutine */ int smmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -real *alpha, *a; -integer *lda; -real *b; -integer *ldb; -real *beta, *c__; -integer *ldc; -real *ct, *g, *cc; -integer *ldcc; -real *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int smmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, real* alpha, real* a, integer* lda, real* b, integer* ldb, real* beta, real* c__, integer* ldc, real* ct, real* g, real* cc, integer* ldcc, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ @@ -3260,8 +2952,7 @@ ftnlen transb_len; real r__1, r__2; /* Builtin functions */ - double sqrt(); - integer s_wsfe(), e_wsfe(), do_fio(); + double sqrt(double); /* Local variables */ static real erri; @@ -3426,9 +3117,7 @@ L150: } /* smmch_ */ -logical lse_(ri, rj, lr) -real *ri, *rj; -integer *lr; +logical lse_(real* ri, real* rj, integer* lr) { /* System generated locals */ integer i__1; @@ -3475,13 +3164,7 @@ L30: } /* lse_ */ -logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -real *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2; @@ -3572,8 +3255,7 @@ L80: } /* lseres_ */ -doublereal sbeg_(reset) -logical *reset; +doublereal sbeg_(logical* reset) { /* System generated locals */ real ret_val; @@ -3625,8 +3307,7 @@ L10: } /* sbeg_ */ -doublereal sdiff_(x, y) -real *x, *y; +doublereal sdiff_(real* x, real* y) { /* System generated locals */ real ret_val; diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c index d5b080633..4761e63d7 100644 --- a/ctest/c_zblat1c.c +++ b/ctest/c_zblat1c.c @@ -242,250 +242,6 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -#if 0 -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif -/* -- translated by f2c (version 20000121). - You must link the resulting object file with the libraries: - -lf2c -lm (in that order) -*/ - /* Common Block Declarations */ @@ -396,7 +273,7 @@ static integer c_n1 = -1; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program */ int main() +/* Main program */ int main(void) { /* Initialized data */ @@ -414,19 +291,23 @@ static logical c_false = FALSE_; static logical same; static integer ninc, nbet, ntra; static logical rewi; - extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), - zchk5_(), zchk6_(); + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); static doublecomplex a[4225] /* was [65][65] */; static doublereal g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublecomplex x[65], y[65], z__[130]; static logical trace; static integer nidim; static char snaps[32], trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer isnum; static logical ltest[17]; static doublecomplex aa[4225]; @@ -441,12 +322,12 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cz2chke_(); + extern /* Subroutine */ void cz2chke_(char*, ftnlen); static doublecomplex alf[7]; static integer inc[7], nkb; static doublecomplex bet[7]; static doublereal eps, err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); char tmpchar; /* Test program for the DOUBLE PRECISION COMPLEX Level 2 Blas. */ @@ -984,22 +865,7 @@ L240: } /* MAIN__ */ -/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1018,27 +884,27 @@ ftnlen sname_len; static integer i__, m, n; static doublecomplex alpha; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; static char trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer ia, ib, ic; static logical banded; static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns; - extern /* Subroutine */ int czgbmv_(); + extern /* Subroutine */ void czgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static char ctrans[14]; - extern /* Subroutine */ int czgemv_(); + extern /* Subroutine */ void czgemv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1]; static integer laa, lda; static doublecomplex als, bls; static doublereal err; static integer iku, kls; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); static integer kus; @@ -1451,22 +1317,7 @@ L140: } /* zchk1_ */ -/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, - incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1486,27 +1337,28 @@ ftnlen sname_len; static integer i__, k, n; static doublecomplex alpha; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char uplos[1]; static integer ia, ib, ic; static logical banded; static integer nc, ik, in; static logical packed; static integer nk, ks, ix, iy, ns, lx, ly; - extern /* Subroutine */ int czhbmv_(), czhemv_(); + extern /* Subroutine */ void czhbmv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ void czhemv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); - extern /* Subroutine */ int czhpmv_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czhpmv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen); static integer laa, lda; static doublecomplex als, bls; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests CHEMV, CHBMV and CHPMV. */ @@ -1909,19 +1761,7 @@ L130: } /* zchk2_ */ -/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, xt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *xt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* xt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1942,13 +1782,13 @@ ftnlen sname_len; static integer i__, k, n; static char diags[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs; static char trans[1]; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static char uplos[1]; static logical banded; static integer nc, ik, in; @@ -1957,14 +1797,17 @@ ftnlen sname_len; static char ctrans[14]; static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); - extern /* Subroutine */ int cztbmv_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztbmv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static char transs[1]; - extern /* Subroutine */ int cztbsv_(), cztpmv_(), cztrmv_(), cztpsv_(), - cztrsv_(); + extern /* Subroutine */ void cztbsv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztpmv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztpsv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer laa, icd, lda, ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); @@ -2422,21 +2265,7 @@ L130: } /* zchk3_ */ -/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; @@ -2450,21 +2279,21 @@ ftnlen sname_len; static integer i__, j, m, n; static doublecomplex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly; - extern /* Subroutine */ int czgerc_(); + extern /* Subroutine */ void czgerc_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*); static doublereal errmax; - extern /* Subroutine */ int czgeru_(); + extern /* Subroutine */ void czgeru_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*); static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublecomplex als; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); @@ -2793,21 +2622,7 @@ L150: } /* zchk4_ */ -/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2827,13 +2642,14 @@ ftnlen sname_len; static integer i__, j, n; static doublecomplex alpha, w[1]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int czher_(); + extern /* Subroutine */ void czher_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen); static logical reset; static char cuplo[14]; static integer incxs; - extern /* Subroutine */ int czhpr_(), zmvch_(); + extern /* Subroutine */ void czhpr_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, ftnlen); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static logical upper; static char uplos[1]; static integer ia, ja, ic, nc, jj, lj, in; @@ -2841,10 +2657,10 @@ ftnlen sname_len; static integer ix, ns, lx; static doublereal ralpha, errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER and ZHPR. */ @@ -3167,21 +2983,7 @@ L130: } /* zchk5_ */ -/* Subroutine */ int zchk6_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, - xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *ninc, *inc, *nmax, *incmax; -doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt; -doublereal *g; -doublecomplex *z__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -3201,25 +3003,26 @@ ftnlen sname_len; static integer i__, j, n; static doublecomplex alpha, w[2]; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; static logical reset; static char cuplo[14]; static integer incxs, incys; - extern /* Subroutine */ int zmvch_(); + extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen); static logical upper; static char uplos[1]; - extern /* Subroutine */ int czher2_(), czhpr2_(); + extern /* Subroutine */ void czher2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ void czhpr2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, ftnlen); static integer ia, ja, ic, nc, jj, lj, in; static logical packed; static integer ix, iy, ns, lx, ly; static doublereal errmax; static doublecomplex transl; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda; static doublecomplex als; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2 and ZHPR2. */ @@ -3604,24 +3407,7 @@ L170: } /* zchk6_ */ -/* Subroutine */ int zmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, - incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len) -char *trans; -integer *m, *n; -doublecomplex *alpha, *a; -integer *nmax; -doublecomplex *x; -integer *incx; -doublecomplex *beta, *y; -integer *incy; -doublecomplex *yt; -doublereal *g; -doublecomplex *yy; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen trans_len; +/* Subroutine */ int zmvch_(char* trans, integer* m, integer* n, doublecomplex* alpha, doublecomplex* a, integer* nmax, doublecomplex* x, integer* incx, doublecomplex* beta, doublecomplex* y, integer* incy, doublecomplex* yt, doublereal* g, doublecomplex* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len) { /* System generated locals */ @@ -3819,9 +3605,7 @@ L80: } /* zmvch_ */ -logical lze_(ri, rj, lr) -doublecomplex *ri, *rj; -integer *lr; +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -3868,13 +3652,7 @@ L30: } /* lze_ */ -logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublecomplex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex* aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -3967,9 +3745,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_( ret_val, reset) -doublecomplex * ret_val; -logical *reset; +/* Double Complex */ VOID zbeg_( doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4030,8 +3806,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; @@ -4051,19 +3826,7 @@ doublereal *x, *y; } /* ddiff_ */ -/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, - ku, reset, transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublecomplex *a; -integer *nmax; -doublecomplex *aa; -integer *lda, *kl, *ku; -logical *reset; -doublecomplex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4072,7 +3835,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend, ioff; - extern /* Double Complex */ VOID zbeg_(); + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); static logical unit; static integer i__, j; static logical lower; diff --git a/ctest/c_zblat3c.c b/ctest/c_zblat3c.c index eca2c3ff6..6025c0052 100644 --- a/ctest/c_zblat3c.c +++ b/ctest/c_zblat3c.c @@ -22,14 +22,11 @@ typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; #ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} #else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} #endif #define pCf(z) (*_pCf(z)) @@ -242,124 +239,7 @@ typedef struct Namelist Namelist; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -#if 0 -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -#endif + /* Common Block Declarations */ @@ -388,7 +268,7 @@ static logical c_true = TRUE_; static integer c__0 = 0; static logical c_false = FALSE_; -/* Main program MAIN__() */ int main() +/* Main program MAIN__() */ int main(void) { /* Initialized data */ @@ -400,26 +280,29 @@ static logical c_false = FALSE_; doublereal d__1; /* Builtin functions */ - integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), - e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe(); + integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), + e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void); /* Local variables */ static integer nalf, idim[9]; static logical same; static integer nbet, ntra; static logical rewi; - extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), - zchk5_(); + extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); + extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen); + extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen); static doublecomplex c__[4225] /* was [65][65] */; static doublereal g[65]; static integer i__, j; - extern doublereal ddiff_(); + extern doublereal ddiff_(doublereal*, doublereal*); static integer n; static logical fatal; static doublecomplex w[130]; static logical trace; static integer nidim; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static char snaps[32]; static integer isnum; static logical ltest[9]; @@ -431,10 +314,10 @@ static logical c_false = FALSE_; static logical rorder; static integer layout; static logical ltestt, tsterr; - extern /* Subroutine */ int cz3chke_(); + extern /* Subroutine */ int cz3chke_(char*, ftnlen); static doublecomplex alf[7], bet[7]; static doublereal eps, err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); char tmpchar; /* Test program for the COMPLEX*16 Level 3 Blas. */ @@ -924,22 +807,7 @@ L230: } /* MAIN__ */ -/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -956,21 +824,21 @@ ftnlen sname_len; static integer i__, k, m, n; static doublecomplex alpha; static logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static integer ia, ib; - extern /* Subroutine */ int zprcn1_(); + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - extern /* Subroutine */ int czgemm_(); + extern /* Subroutine */ void czgemm_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char tranas[1], tranbs[1], transa[1], transb[1]; static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; static doublecomplex als, bls; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZGEMM. */ @@ -1313,20 +1181,7 @@ L130: } /* zchk1_ */ -/* Subroutine */ int zprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k, - alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *transa, *transb; -integer *m, *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { /* Local variables */ static char crc[14], cta[14], ctb[14]; @@ -1357,22 +1212,7 @@ return 0; } /* zprcn1_ */ -/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1394,23 +1234,23 @@ ftnlen sname_len; static doublecomplex alpha; static logical isame[13]; static char sides[1]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static char uplos[1]; static integer ia, ib; - extern /* Subroutine */ int zprcn2_(); + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer na, nc, im, in, ms, ns; - extern /* Subroutine */ int czhemm_(); + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern logical lzeres_(); - extern /* Subroutine */ int czsymm_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc, ics; static doublecomplex als, bls; static integer icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHEMM and ZSYMM. */ @@ -1737,20 +1577,7 @@ L120: } /* zchk2_ */ -/* Subroutine */ int zprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha, - lda, ldb, beta, ldc, sname_len, side_len, uplo_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo; -integer *m, *n; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { /* Local variables */ static char cs[14], cu[14], crc[14]; @@ -1777,21 +1604,7 @@ return 0; } /* zprcn2_ */ -/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__, - iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *ct; -doublereal *g; -doublecomplex *c__; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -1817,23 +1630,24 @@ ftnlen sname_len; static char diags[1]; static logical isame[13]; static char sides[1]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static logical reset; static char uplos[1]; static integer ia, na; - extern /* Subroutine */ int zprcn3_(); + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); static integer nc, im, in, ms, ns; static char tranas[1], transa[1]; static doublereal errmax; - extern logical lzeres_(); - extern /* Subroutine */ int cztrmm_(), cztrsm_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); static integer laa, icd, lbb, lda, ldb, ics; static doublecomplex als; static integer ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZTRMM and ZTRSM. */ @@ -2227,21 +2041,7 @@ L160: } /* zchk3_ */ -/* Subroutine */ int zprcn3_(nout, nc, sname, iorder, side, uplo, transa, - diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, - transa_len, diag_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *side, *uplo, *transa, *diag; -integer *m, *n; -doublecomplex *alpha; -integer *lda, *ldb; -ftnlen sname_len; -ftnlen side_len; -ftnlen uplo_len; -ftnlen transa_len; -ftnlen diag_len; +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { /* Local variables */ @@ -2281,22 +2081,7 @@ return 0; } /* zprcn3_ */ -/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, - c__, cc, cs, ct, g, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2320,30 +2105,30 @@ ftnlen sname_len; static doublecomplex alpha; static doublereal rbeta; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static doublereal rbets; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; static integer ia, ib, jc, ma, na; - extern /* Subroutine */ int zprcn4_(); + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); static integer nc; - extern /* Subroutine */ int zprcn6_(); + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ik, in, jj, lj, ks, ns; static doublereal ralpha; - extern /* Subroutine */ int czherk_(); + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1], transt[1]; - extern /* Subroutine */ int czsyrk_(); + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lda, lcc, ldc; static doublecomplex als; static integer ict, icu; static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHERK and ZSYRK. */ @@ -2732,20 +2517,7 @@ L130: } /* zchk4_ */ -/* Subroutine */ int zprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -2775,20 +2547,7 @@ return 0; -/* Subroutine */ int zprcn6_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublereal *alpha; -integer *lda; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ @@ -2818,23 +2577,7 @@ return 0; } /* zprcn6_ */ -/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, - fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, - c__, cc, cs, ct, g, w, iorder, sname_len) -char *sname; -doublereal *eps, *thresh; -integer *nout, *ntra; -logical *trace, *rewi, *fatal; -integer *nidim, *idim, *nalf; -doublecomplex *alf; -integer *nbet; -doublecomplex *bet; -integer *nmax; -doublecomplex *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct; -doublereal *g; -doublecomplex *w; -integer *iorder; -ftnlen sname_len; +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ @@ -2857,27 +2600,28 @@ ftnlen sname_len; static doublecomplex alpha; static doublereal rbeta; static logical isame[13]; - extern /* Subroutine */ int zmake_(); + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); static integer nargs; - extern /* Subroutine */ int zmmch_(); + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); static doublereal rbets; static logical reset; static char trans[1]; static logical upper; static char uplos[1]; static integer ia, ib, jc, ma, na, nc; - extern /* Subroutine */ int zprcn5_(), zprcn7_(); + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); static integer ik, in, jj, lj, ks, ns; static doublereal errmax; - extern logical lzeres_(); + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static char transs[1], transt[1]; - extern /* Subroutine */ int czher2k_(); + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); static integer laa, lbb, lda, lcc, ldb, ldc; static doublecomplex als; static integer ict, icu; - extern /* Subroutine */ int czsyr2k_(); + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); static doublereal err; - extern logical lze_(); + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2K and ZSYR2K. */ @@ -3349,20 +3093,7 @@ L160: } /* zchk5_ */ -/* Subroutine */ int zprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublecomplex *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ static char ca[14], cu[14], crc[14]; @@ -3392,20 +3123,7 @@ return 0; -/* Subroutine */ int zprcn7_(nout, nc, sname, iorder, uplo, transa, n, k, - alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len) -integer *nout, *nc; -char *sname; -integer *iorder; -char *uplo, *transa; -integer *n, *k; -doublecomplex *alpha; -integer *lda, *ldb; -doublereal *beta; -integer *ldc; -ftnlen sname_len; -ftnlen uplo_len; -ftnlen transa_len; +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { /* Local variables */ @@ -3435,19 +3153,7 @@ return 0; } /* zprcn7_ */ -/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset, - transl, type_len, uplo_len, diag_len) -char *type__, *uplo, *diag; -integer *m, *n; -doublecomplex *a; -integer *nmax; -doublecomplex *aa; -integer *lda; -logical *reset; -doublecomplex *transl; -ftnlen type_len; -ftnlen uplo_len; -ftnlen diag_len; +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -3456,7 +3162,7 @@ ftnlen diag_len; /* Local variables */ static integer ibeg, iend; - extern /* Double Complex */ VOID zbeg_(); + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); static logical unit; static integer i__, j; static logical lower, upper; @@ -3629,27 +3335,7 @@ ftnlen diag_len; } /* zmake_ */ -/* Subroutine */ int zmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, - beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, - transa_len, transb_len) -char *transa, *transb; -integer *m, *n, *kk; -doublecomplex *alpha, *a; -integer *lda; -doublecomplex *b; -integer *ldb; -doublecomplex *beta, *c__; -integer *ldc; -doublecomplex *ct; -doublereal *g; -doublecomplex *cc; -integer *ldcc; -doublereal *eps, *err; -logical *fatal; -integer *nout; -logical *mv; -ftnlen transa_len; -ftnlen transb_len; +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { /* System generated locals */ @@ -3658,7 +3344,7 @@ ftnlen transb_len; doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; - double sqrt(); + double sqrt(double); /* Local variables */ static doublereal erri; static integer i__, j, k; @@ -4031,9 +3717,7 @@ L250: } /* zmmch_ */ -logical lze_(ri, rj, lr) -doublecomplex *ri, *rj; -integer *lr; +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; @@ -4082,13 +3766,7 @@ L30: } /* lze_ */ -logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len) -char *type__, *uplo; -integer *m, *n; -doublecomplex *aa, *as; -integer *lda; -ftnlen type_len; -ftnlen uplo_len; +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; @@ -4184,9 +3862,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_( ret_val, reset) -doublecomplex * ret_val; -logical *reset; +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4249,8 +3925,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(x, y) -doublereal *x, *y; +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; From 4041b7fb42dcba67d99a0e00e1e820b6cc29f7fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Oct 2023 22:33:08 +0200 Subject: [PATCH 346/718] fix function prototypes in f2c-converted files --- lapack-netlib/INSTALL/dlamch.c | 230 +-------------------- lapack-netlib/INSTALL/droundup_lwork.c | 141 ------------- lapack-netlib/INSTALL/dsecnd_INT_ETIME.c | 144 +------------- lapack-netlib/INSTALL/ilaver.c | 243 +---------------------- lapack-netlib/INSTALL/second_INT_ETIME.c | 144 +------------- lapack-netlib/INSTALL/slamch.c | 229 --------------------- lapack-netlib/INSTALL/sroundup_lwork.c | 141 ------------- 7 files changed, 6 insertions(+), 1266 deletions(-) diff --git a/lapack-netlib/INSTALL/dlamch.c b/lapack-netlib/INSTALL/dlamch.c index 744130a87..ce6b76a32 100644 --- a/lapack-netlib/INSTALL/dlamch.c +++ b/lapack-netlib/INSTALL/dlamch.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,217 +273,7 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; - _Complex float zdotc = 0.0; - if (incx == 1 && incy == 1) { - for (i=0;i Date: Sat, 7 Oct 2023 22:36:29 +0200 Subject: [PATCH 347/718] fix function prototypes in f2c-converted files --- lapack-netlib/TESTING/MATGEN/clagge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/claghe.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clagsy.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clahilb.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clakf2.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clarge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clarnd.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/claror.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clarot.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatm1.c | 235 ----------------------- lapack-netlib/TESTING/MATGEN/clatm2.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatm3.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatm5.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatm6.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatme.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatmr.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatms.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/clatmt.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlagge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlagsy.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlahilb.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlakf2.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlaran.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlarge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlarnd.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlaror.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlarot.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatm1.c | 235 ----------------------- lapack-netlib/TESTING/MATGEN/dlatm2.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatm3.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatm5.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatm6.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatm7.c | 235 ----------------------- lapack-netlib/TESTING/MATGEN/dlatme.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatmr.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatms.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/dlatmt.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slagge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slagsy.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slahilb.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slakf2.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slaran.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slarge.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slarnd.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slaror.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slarot.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatm1.c | 235 ----------------------- lapack-netlib/TESTING/MATGEN/slatm2.c | 247 ------------------------ lapack-netlib/TESTING/MATGEN/slatm3.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatm5.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatm6.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatm7.c | 235 ----------------------- lapack-netlib/TESTING/MATGEN/slatme.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatmr.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatms.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/slatmt.c | 248 ------------------------ lapack-netlib/TESTING/MATGEN/zlagge.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlaghe.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlagsy.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlahilb.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlakf2.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlarge.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlarnd.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlaror.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlarot.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatm1.c | 236 ----------------------- lapack-netlib/TESTING/MATGEN/zlatm2.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatm3.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatm5.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatm6.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatme.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatmr.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatms.c | 249 ------------------------- lapack-netlib/TESTING/MATGEN/zlatmt.c | 249 ------------------------- 74 files changed, 18291 deletions(-) diff --git a/lapack-netlib/TESTING/MATGEN/clagge.c b/lapack-netlib/TESTING/MATGEN/clagge.c index f05905bd7..62c33d01e 100644 --- a/lapack-netlib/TESTING/MATGEN/clagge.c +++ b/lapack-netlib/TESTING/MATGEN/clagge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/claror.c b/lapack-netlib/TESTING/MATGEN/claror.c index cd0d15300..b0d73f37c 100644 --- a/lapack-netlib/TESTING/MATGEN/claror.c +++ b/lapack-netlib/TESTING/MATGEN/claror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.c b/lapack-netlib/TESTING/MATGEN/clatm3.c index 58cd4e551..fcd8dbfcb 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm3.c +++ b/lapack-netlib/TESTING/MATGEN/clatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b CLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm5.c b/lapack-netlib/TESTING/MATGEN/clatm5.c index c2b81ccf3..8fbc1c0a6 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm5.c +++ b/lapack-netlib/TESTING/MATGEN/clatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLARAN */ diff --git a/lapack-netlib/TESTING/MATGEN/dlarge.c b/lapack-netlib/TESTING/MATGEN/dlarge.c index 5d8a81387..5cc7fbce8 100644 --- a/lapack-netlib/TESTING/MATGEN/dlarge.c +++ b/lapack-netlib/TESTING/MATGEN/dlarge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/dlaror.c b/lapack-netlib/TESTING/MATGEN/dlaror.c index d9e2e46ae..fdd126174 100644 --- a/lapack-netlib/TESTING/MATGEN/dlaror.c +++ b/lapack-netlib/TESTING/MATGEN/dlaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.c b/lapack-netlib/TESTING/MATGEN/dlatm2.c index d74bc9168..7491e9829 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm2.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm2.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.c b/lapack-netlib/TESTING/MATGEN/dlatm3.c index 86f964ced..a9d26c7fc 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm3.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm5.c b/lapack-netlib/TESTING/MATGEN/dlatm5.c index 94b49d6e3..7f1c36428 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b DLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatme.c b/lapack-netlib/TESTING/MATGEN/dlatme.c index a92c70ef2..e29df164c 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatme.c +++ b/lapack-netlib/TESTING/MATGEN/dlatme.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLARAN */ diff --git a/lapack-netlib/TESTING/MATGEN/slarge.c b/lapack-netlib/TESTING/MATGEN/slarge.c index 6b37e9400..d5fbd541c 100644 --- a/lapack-netlib/TESTING/MATGEN/slarge.c +++ b/lapack-netlib/TESTING/MATGEN/slarge.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/slaror.c b/lapack-netlib/TESTING/MATGEN/slaror.c index 48b532dfd..7e3065432 100644 --- a/lapack-netlib/TESTING/MATGEN/slaror.c +++ b/lapack-netlib/TESTING/MATGEN/slaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.c b/lapack-netlib/TESTING/MATGEN/slatm2.c index e7b72006f..833ee5dea 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm2.c +++ b/lapack-netlib/TESTING/MATGEN/slatm2.c @@ -261,253 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.c b/lapack-netlib/TESTING/MATGEN/slatm3.c index 4f9f5fee2..cdf96ef51 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm3.c +++ b/lapack-netlib/TESTING/MATGEN/slatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm5.c b/lapack-netlib/TESTING/MATGEN/slatm5.c index 24ee0915d..9122bc041 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm5.c +++ b/lapack-netlib/TESTING/MATGEN/slatm5.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b SLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatme.c b/lapack-netlib/TESTING/MATGEN/slatme.c index a8a6b39a3..126c42121 100644 --- a/lapack-netlib/TESTING/MATGEN/slatme.c +++ b/lapack-netlib/TESTING/MATGEN/slatme.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLARND */ diff --git a/lapack-netlib/TESTING/MATGEN/zlaror.c b/lapack-netlib/TESTING/MATGEN/zlaror.c index 6ada57b8a..c8a84f215 100644 --- a/lapack-netlib/TESTING/MATGEN/zlaror.c +++ b/lapack-netlib/TESTING/MATGEN/zlaror.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} static double dpow_ui(double x, integer n) { double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,224 +273,6 @@ static double dpow_ui(double x, integer n) { } return pow; } -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLATM2 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.c b/lapack-netlib/TESTING/MATGEN/zlatm3.c index c35ffe4d9..6370a9d39 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm3.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm3.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i \brief \b ZLATM3 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm5.c b/lapack-netlib/TESTING/MATGEN/zlatm5.c index 753ee0ce6..5ee6cc8ce 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm5.c @@ -248,7 +248,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -262,254 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i Date: Sat, 7 Oct 2023 22:38:30 +0200 Subject: [PATCH 348/718] fix function prototypes in f2c-converted files --- lapack-netlib/SRC/DEPRECATED/cgegs.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/cgegv.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/cgelsx.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/cgeqpf.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/cggsvd.c | 243 +------------------------ lapack-netlib/SRC/DEPRECATED/cggsvp.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/clahrd.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/clatzm.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/ctzrqf.c | 247 ------------------------- lapack-netlib/SRC/DEPRECATED/dgegs.c | 242 +------------------------ lapack-netlib/SRC/DEPRECATED/dgegv.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/dgelsx.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/dgeqpf.c | 243 +------------------------ lapack-netlib/SRC/DEPRECATED/dggsvd.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/dggsvp.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/dlahrd.c | 246 ------------------------- lapack-netlib/SRC/DEPRECATED/dlatzm.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/dtzrqf.c | 247 ------------------------- lapack-netlib/SRC/DEPRECATED/sgegs.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/sgegv.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/sgelsx.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/sgeqpf.c | 243 +------------------------ lapack-netlib/SRC/DEPRECATED/sggsvd.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/sggsvp.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/slahrd.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/slatzm.c | 247 ------------------------- lapack-netlib/SRC/DEPRECATED/stzrqf.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/zgegs.c | 241 ------------------------- lapack-netlib/SRC/DEPRECATED/zgegv.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/zgelsx.c | 242 ------------------------- lapack-netlib/SRC/DEPRECATED/zgeqpf.c | 243 +------------------------ lapack-netlib/SRC/DEPRECATED/zggsvd.c | 243 +------------------------ lapack-netlib/SRC/DEPRECATED/zggsvp.c | 244 +------------------------ lapack-netlib/SRC/DEPRECATED/zlahrd.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/zlatzm.c | 248 -------------------------- lapack-netlib/SRC/DEPRECATED/ztzrqf.c | 248 -------------------------- 36 files changed, 7 insertions(+), 8767 deletions(-) diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.c b/lapack-netlib/SRC/DEPRECATED/cgegs.c index 35b59e683..270a05ebd 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/cgegs.c @@ -247,7 +247,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i Date: Sun, 8 Oct 2023 11:36:06 +0200 Subject: [PATCH 349/718] fix prototype of itest1 for INTERFACE64 --- ctest/c_sblat1c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 1424e39b4..7c049b796 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -437,7 +437,7 @@ L40: static real stemp[1], strue[8]; extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*), sscaltest_(int*,real*,real*,int*); extern real sasumtest_(int*,real*,int*); - extern /* Subroutine */ int itest1_(int*,int*), stest1_(real*,real*,real*,real*); + extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(real*,real*,real*,real*); static real sx[8]; static integer np1; extern integer isamaxtest_(int*,real*,int*); From 2b865da7304f930d44194681bd7ef86cf8edbb4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Oct 2023 11:55:10 +0200 Subject: [PATCH 350/718] fix prototypes of stest and ctest for INTERFACE64 --- ctest/c_zblat1c.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c index 4761e63d7..d8cff5dee 100644 --- a/ctest/c_zblat1c.c +++ b/ctest/c_zblat1c.c @@ -378,7 +378,7 @@ static doublereal c_b43 = 1.; /* Local variables */ static integer i__; - extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); + extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); static doublecomplex mwpcs[5], mwpct[5]; extern /* Subroutine */ int zscaltest_(int*, doublereal*, doublecomplex*, int*), itest1_(int*, int*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); static doublecomplex cx[8]; @@ -588,7 +588,7 @@ static doublereal c_b43 = 1.; /* Local variables */ static doublecomplex cdot[1]; static integer lenx, leny, i__; - extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); + extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); static integer ksize; static doublecomplex ztemp; extern /* Subroutine */ int zdotctest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zcopytest_(int*, doublecomplex*, int*, doublecomplex*, int*); @@ -747,7 +747,7 @@ L40: /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac) { static doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*); + extern /* Subroutine */ integer stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*); /* ************************* STEST1 ***************************** */ From c30b53087860186287b5db0ba455499fbe7d20e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Oct 2023 11:59:19 +0200 Subject: [PATCH 351/718] fix prototypes of ctest and itest for INTERFACE64 --- ctest/c_cblat1c.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 2f84da43b..fa97cbf92 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -437,9 +437,9 @@ static real c_b43 = (float)1.; /* Local variables */ static integer i__; - extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*); + extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static complex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int itest1_(int*, int*), stest1_(real*,real*,real*,real*); + extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); static complex cx[8]; extern real scnrm2test_(int*, complex*, int*); static integer np1; @@ -734,7 +734,7 @@ static real c_b43 = (float)1.; static complex cdot[1]; static integer lenx, leny, i__; static complex ctemp; - extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*); + extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static integer ksize; extern /* Subroutine */ int cdotctest_(int*, complex*, int*, complex*, int*,complex*), ccopytest_(int*, complex*, int*, complex*, int*), cdotutest_(int*, complex*, int*, complex*, int*, complex*), cswaptest_(int*, complex*, int*, complex*, int*), caxpytest_(int*, complex*, complex*, int*, complex*, int*); @@ -939,7 +939,7 @@ doublereal sdiff_(real* sa, real* sb) /* Local variables */ static integer i__; static real scomp[20], ssize[20], strue[20]; - extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*); + extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*); /* **************************** CTEST ***************************** */ From 769a58e9d18029cefab7647916fec3bd635b604e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Oct 2023 12:51:41 +0200 Subject: [PATCH 352/718] fix prototypes of stest and itest1 for INTERFACE64 --- ctest/c_dblat1c.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c index f0141f2a5..d26cd9924 100644 --- a/ctest/c_dblat1c.c +++ b/ctest/c_dblat1c.c @@ -404,9 +404,9 @@ L40: static integer i__; extern doublereal dnrm2test_(int*, doublereal*, int*); static doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*); + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*); extern doublereal dasumtest_(int*,doublereal*,int*); - extern /* Subroutine */ int itest1_(int*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); + extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sx[8]; static integer np1; extern integer idamaxtest_(int*,doublereal*,int*); @@ -517,7 +517,7 @@ L40: static integer lenx, leny; extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*); static integer i__, j, ksize; - extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static integer ki, kn, mx, my; static doublereal sx[7], sy[7], stx[7], sty[7]; @@ -620,7 +620,7 @@ L40: /* Local variables */ extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*); static integer i__, k, ksize; - extern /* Subroutine */int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*); + extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*); static integer ki, kn; static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10]; @@ -691,7 +691,7 @@ L40: return 0; } /* check3_ */ -/* Subroutine */ int stest_(int* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac) +/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac) { /* System generated locals */ integer i__1; @@ -758,7 +758,7 @@ L40: /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac) { static doublereal scomp[1], strue[1]; - extern /* Subroutine */ int stest_(int*, doublereal*, doublereal*, doublereal*, doublereal*); + extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*); /* ************************* STEST1 ***************************** */ From d8126c76e770efe8b7394d856de58ada75e5f3b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Oct 2023 13:38:39 +0200 Subject: [PATCH 353/718] fix prototype --- ctest/c_cblat1c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index fa97cbf92..929f3eaf3 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -887,7 +887,7 @@ L40: /* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac) { static real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(int*, real*, real*, real*, real*); + extern /* Subroutine */ int stest_(integer*, real*, real*, real*, real*); /* ************************* STEST1 ***************************** */ From c5e7339c9eb441c56264f30d0b998ff87335cbef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Oct 2023 16:13:37 +0200 Subject: [PATCH 354/718] correct prototypes for INTERFACE64 builds --- ctest/c_cblat1c.c | 12 ++++++------ ctest/c_dblat1c.c | 20 ++++++++++---------- ctest/c_sblat1c.c | 22 +++++++++++----------- ctest/c_zblat1c.c | 10 +++++----- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 929f3eaf3..b4c512436 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -441,11 +441,11 @@ static real c_b43 = (float)1.; static complex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); static complex cx[8]; - extern real scnrm2test_(int*, complex*, int*); + extern real scnrm2test_(integer*, complex*, integer*); static integer np1; - extern integer icamaxtest_(int*, complex*, int*); - extern /* Subroutine */ int csscaltest_(int*, real*, complex*, int*); - extern real scasumtest_(int*, complex*, int*); + extern integer icamaxtest_(integer*, complex*, integer*); + extern /* Subroutine */ int csscaltest_(integer*, real*, complex*, integer*); + extern real scasumtest_(integer*, complex*, integer*); static integer len; /* .. Parameters .. */ @@ -736,8 +736,8 @@ static real c_b43 = (float)1.; static complex ctemp; extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static integer ksize; - extern /* Subroutine */ int cdotctest_(int*, complex*, int*, complex*, int*,complex*), ccopytest_(int*, complex*, int*, complex*, int*), cdotutest_(int*, complex*, int*, complex*, int*, complex*), - cswaptest_(int*, complex*, int*, complex*, int*), caxpytest_(int*, complex*, complex*, int*, complex*, int*); + extern /* Subroutine */ int cdotctest_(integer*, complex*, integer*, complex*, integer*,complex*), ccopytest_(integer*, complex*, integer*, complex*, integer*), cdotutest_(integer*, complex*, integer*, complex*, integer*, complex*), + cswaptest_(integer*, complex*, integer*, complex*, integer*), caxpytest_(integer*, complex*, complex*, integer*, complex*, integer*); static integer ki, kn; static complex cx[7], cy[7]; static integer mx, my; diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c index d26cd9924..089dca4da 100644 --- a/ctest/c_dblat1c.c +++ b/ctest/c_dblat1c.c @@ -402,14 +402,14 @@ L40: /* Local variables */ static integer i__; - extern doublereal dnrm2test_(int*, doublereal*, int*); + extern doublereal dnrm2test_(integer*, doublereal*, integer*); static doublereal stemp[1], strue[8]; - extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*); - extern doublereal dasumtest_(int*,doublereal*,int*); + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(integer*,doublereal*,doublereal*,integer*); + extern doublereal dasumtest_(integer*,doublereal*,integer*); extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static doublereal sx[8]; static integer np1; - extern integer idamaxtest_(int*,doublereal*,int*); + extern integer idamaxtest_(integer*,doublereal*,integer*); static integer len; /* .. Parameters .. */ @@ -515,10 +515,10 @@ L40: /* Local variables */ static integer lenx, leny; - extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*); + extern doublereal ddottest_(integer*,doublereal*,integer*,doublereal*,integer*); static integer i__, j, ksize; - extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), - daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); + extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(integer*,doublereal*,integer*,doublereal*,integer*), dswaptest_(integer*,doublereal*,integer*,doublereal*,integer*), + daxpytest_(integer*,doublereal*,doublereal*,integer*,doublereal*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*); static integer ki, kn, mx, my; static doublereal sx[7], sy[7], stx[7], sty[7]; @@ -618,9 +618,9 @@ L40: ; /* Local variables */ - extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*); + extern /* Subroutine */ int drottest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*,doublereal*); static integer i__, k, ksize; - extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*); + extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*); static integer ki, kn; static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10]; @@ -799,7 +799,7 @@ doublereal sdiff_(doublereal* sa, doublereal* sb) return ret_val; } /* sdiff_ */ -/* Subroutine */ int itest1_(int* icomp, int* itrue) +/* Subroutine */ int itest1_(integer* icomp, integer* itrue) { /* Local variables */ static integer id; diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 7c049b796..7a81e04c1 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -433,14 +433,14 @@ L40: /* Local variables */ static integer i__; - extern real snrm2test_(int*,real*,int*); + extern real snrm2test_(integer*,real*,integer*); static real stemp[1], strue[8]; - extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*), sscaltest_(int*,real*,real*,int*); - extern real sasumtest_(int*,real*,int*); + extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*), sscaltest_(integer*,real*,real*,integer*); + extern real sasumtest_(integer*,real*,integer*); extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(real*,real*,real*,real*); static real sx[8]; static integer np1; - extern integer isamaxtest_(int*,real*,int*); + extern integer isamaxtest_(integer*,real*,integer*); static integer len; @@ -590,10 +590,10 @@ L40: /* Local variables */ static integer lenx, leny; - extern real sdottest_(int*,real*,int*,real*,int*); + extern real sdottest_(integer*,real*,integer*,real*,integer*); static integer i__, j, ksize; - extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), scopytest_(int*,real*,int*,real*,int*), sswaptest_(int*,real*,int*,real*,int*), - saxpytest_(int*,real*,real*,int*,real*,int*); + extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), scopytest_(integer*,real*,integer*,real*,integer*), sswaptest_(integer*,real*,integer*,real*,integer*), + saxpytest_(integer*,real*,real*,integer*,real*,integer*); static integer ki; extern /* Subroutine */ int stest1_(real*,real*,real*,real*); static integer kn, mx, my; @@ -708,9 +708,9 @@ L40: 1.17 }; /* Local variables */ - extern /* Subroutine */ void srottest_(int*,real*,int*,real*,int*,real*,real*); + extern /* Subroutine */ void srottest_(integer*,real*,integer*,real*,integer*,real*,real*); static integer i__, k, ksize; - extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), srotmtest_(int*,real*,int*,real*,int*,real*); + extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), srotmtest_(integer*,real*,integer*,real*,integer*,real*); static integer ki, kn; static real sx[19], sy[19], sparam[5], stx[19], sty[19]; @@ -781,7 +781,7 @@ L40: return 0; } /* check3_ */ -/* Subroutine */ int stest_(int* len, real* scomp, real* strue, real* ssize, real* sfac) +/* Subroutine */ int stest_(integer* len, real* scomp, real* strue, real* ssize, real* sfac) { integer i__1; real r__1, r__2, r__3, r__4, r__5; @@ -847,7 +847,7 @@ L40: /* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac) { static real scomp[1], strue[1]; - extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*); + extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*); /* ************************* STEST1 ***************************** */ diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c index d8cff5dee..f7c0515fc 100644 --- a/ctest/c_zblat1c.c +++ b/ctest/c_zblat1c.c @@ -380,7 +380,7 @@ static doublereal c_b43 = 1.; static integer i__; extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); static doublecomplex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int zscaltest_(int*, doublereal*, doublecomplex*, int*), itest1_(int*, int*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); + extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); static doublecomplex cx[8]; extern doublereal dznrm2test_(integer*, doublecomplex*, integer*); static integer np1; @@ -591,11 +591,11 @@ static doublereal c_b43 = 1.; extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); static integer ksize; static doublecomplex ztemp; - extern /* Subroutine */ int zdotctest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zcopytest_(int*, doublecomplex*, int*, doublecomplex*, int*); + extern /* Subroutine */ int zdotctest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zcopytest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*); static integer ki; - extern /* Subroutine */ int zdotutest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zswaptest_(int*, doublecomplex*, int*, doublecomplex*, int*); + extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*); static integer kn; - extern /* Subroutine */ int zaxpytest_(int*, doublereal*, doublecomplex*, int*, doublecomplex*, int*); + extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*); static doublecomplex cx[7], cy[7]; static integer mx, my; @@ -747,7 +747,7 @@ L40: /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac) { static doublereal scomp[1], strue[1]; - extern /* Subroutine */ integer stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*); + extern /* Subroutine */ int stest_(integer*,doublereal*, doublereal*, doublereal*, doublereal*); /* ************************* STEST1 ***************************** */ From 103d6f4e42fbe532ae4ea48e8d90d7d792bc93d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Oct 2023 16:15:52 +0200 Subject: [PATCH 355/718] Require "classic ld" with XCODE 15.x on Mac --- Makefile.system | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.system b/Makefile.system index b1a357fdf..77c36c870 100644 --- a/Makefile.system +++ b/Makefile.system @@ -405,6 +405,13 @@ export MACOSX_DEPLOYMENT_TARGET=10.8 endif endif MD5SUM = md5 -r +XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +ifeq (x$(XCVER)x,xx) +XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.) +endif +ifeq (x$(XCVER), x 15) +CCOMMON_OPT += -Wl,-ld_classic +endif endif ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) From 82fc29a57a403c2e6567d06e7dbaf58ae1fe30a5 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 11 Oct 2023 17:04:42 -0500 Subject: [PATCH 356/718] POWER10: Fallback to POWER8 functions As cgemm and zgemm kernels are not optimized for big endian falling back to POWER8 versions. Tested on AIX using gcc and Open XL C. --- kernel/power/KERNEL.POWER10 | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 79d889fe0..58f865322 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -19,8 +19,13 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c +ifeq ($(OSNAME), AIX) +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +else CTRMMKERNEL = cgemm_kernel_power10.S ZTRMMKERNEL = zgemm_kernel_power10.S +endif SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -62,10 +67,18 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c +ifeq ($(OSNAME), AIX) +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +else CGEMMKERNEL = cgemm_kernel_power10.S +endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ifeq ($(OSNAME), AIX) +CGEMMITCOPY = cgemm_tcopy_8_power8.S +else CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +endif CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) @@ -73,7 +86,11 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(OSNAME), AIX) +ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +else ZGEMMKERNEL = zgemm_kernel_power10.S +endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -124,6 +141,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGTEQ9),1) ISAMAXKERNEL = isamax_power9.S else @@ -148,6 +166,15 @@ ICAMINKERNEL = icamin_power9.S else ICAMINKERNEL = icamin.c endif +else +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +endif IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c From 71d733e5f735d18f93ab40d15d85b5129980ec91 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 11 Oct 2023 17:18:42 -0500 Subject: [PATCH 357/718] POWER: Avoid m4 conversions for C files This patch removes intermediate m4 conversions used in sbgemm compilation as it is not needed for .c files. Tested on AIX with gcc and IBM Open XL C. --- kernel/Makefile.L3 | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 174a1d41b..7db9d7907 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -634,15 +634,7 @@ $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) - -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s - m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ - rm sbgemmotcopy.s sbgemmotcopy_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) @@ -650,14 +642,7 @@ $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s - m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ - rm sbgemmitcopy.s sbgemmitcopy_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif endif endif @@ -829,15 +814,8 @@ endif ifeq ($(BUILD_BFLOAT16), 1) $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s - m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s -else $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif -endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) From 97a61d0577bcfefe92df0515ece079234400e7eb Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 11 Oct 2023 17:36:43 -0500 Subject: [PATCH 358/718] Fix bfloat16_bits union so that it always the sizeof unsigned short. --- test/compare_sgemm_sbgemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 276fecae9..57d416c94 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef union { unsigned short v; - struct + struct __attribute__((packed)) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ unsigned short s:1; From e98e3c4783fe9250c9bda11cada6e82ef627854e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 11 Oct 2023 18:05:55 -0500 Subject: [PATCH 359/718] Fix float32_bits union so that it always the sizeof float. --- test/compare_sgemm_sbgemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 57d416c94..b723b6a3b 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -49,7 +49,7 @@ typedef union typedef union { float v; - struct + struct __attribute__((packed)) { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ uint32_t s:1; From e7d05402e02c6cb3e9cca60cff976927cbd4f506 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Oct 2023 14:24:53 +0200 Subject: [PATCH 360/718] Fix up S/D GEMM copy function definitions after #4009 --- kernel/arm64/KERNEL.A64FX | 90 +++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index bd25f7cd8..ccbce27e1 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S @@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S +SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) @@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c +DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S @@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c @@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c From d46eba06a7a95a61649cf25a1fd350f46d8050b6 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 12 Oct 2023 09:41:33 -0500 Subject: [PATCH 361/718] Pack structure only on AIX. --- test/compare_sgemm_sbgemm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index b723b6a3b..cf808b56d 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -32,7 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef union { unsigned short v; +#if defined(_AIX) struct __attribute__((packed)) +#else + struct +#endif { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ unsigned short s:1; @@ -49,7 +53,11 @@ typedef union typedef union { float v; +#if defined(_AIX) struct __attribute__((packed)) +#else + struct +#endif { #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ uint32_t s:1; From fe75c88a2c65240f4b7265be3dbc4b93132ec24f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Oct 2023 18:20:09 +0200 Subject: [PATCH 362/718] AzureCI: move OSX-Clang jobs to macOS-12 to resolve setup/build timeouts --- azure-pipelines.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ff56ad00b..317bc504a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -167,11 +167,10 @@ jobs: - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib - MACOSX_DEPLOYMENT_TARGET: 11.0 steps: - script: | brew update @@ -180,7 +179,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -210,7 +209,7 @@ jobs: - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-11' + vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg From 9f42570e33db010665d65bc50f803a673fcc5af0 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 12 Oct 2023 12:37:40 -0500 Subject: [PATCH 363/718] POWER: Increase macro size limit for AIX This patch increases the macro size limit from 4096 to 16384 to allow compiling larger assembly files in AIX. Tested with GCC and IBM Open XL C. --- kernel/Makefile.L3 | 97 +++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 7db9d7907..448e096a3 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -61,6 +61,15 @@ ifeq ($(CORE), ZEN) USE_TRMM = 1 endif +ifeq ($(OS), AIX) +M4VERSION := $(shell m4 --version < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) +ifeq ($(M4VERSION), 0) +M4_AIX := m4 -l16384 +else +M4_AIX := m4 -B16384 +endif +$(info $$var is [${$(M4_AIX)}]) +endif ifeq ($(CORE), POWER8) ifeq ($(BINARY64),1) USE_TRMM = 1 @@ -653,7 +662,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s - m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(M4_AIX) sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s else @@ -669,7 +678,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s - m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(M4_AIX) sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s else @@ -681,7 +690,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s - m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(M4_AIX) dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s else @@ -699,7 +708,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s - m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(M4_AIX) dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s else @@ -742,7 +751,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s - m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(M4_AIX) cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s else @@ -765,7 +774,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s - m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(M4_AIX) zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s else @@ -797,7 +806,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s - m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(M4_AIX) sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s else @@ -820,7 +829,7 @@ endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s - m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(M4_AIX) dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s else @@ -833,7 +842,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(M4_AIX) cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s else @@ -843,7 +852,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(M4_AIX) cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s else @@ -853,7 +862,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -863,7 +872,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(M4_AIX) cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s else @@ -873,7 +882,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(M4_AIX) zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -885,7 +894,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(M4_AIX) zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -897,7 +906,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(M4_AIX) zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -909,7 +918,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(M4_AIX) zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else ifeq ($(CORE),SANDYBRIDGE) @@ -935,7 +944,7 @@ ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s - m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(M4_AIX) strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s else @@ -945,7 +954,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s - m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(M4_AIX) strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s else @@ -955,7 +964,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s - m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(M4_AIX) strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s else @@ -965,7 +974,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -975,7 +984,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s - m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(M4_AIX) dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s else @@ -985,7 +994,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s - m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s + $(M4_AIX) dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s else @@ -995,7 +1004,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s - m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s + $(M4_AIX) dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s else @@ -1005,7 +1014,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s - m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(M4_AIX) dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s else @@ -1027,7 +1036,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(M4_AIX) ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s else @@ -1037,7 +1046,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(M4_AIX) ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s else @@ -1047,7 +1056,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(M4_AIX) ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s else @@ -1057,7 +1066,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(M4_AIX) ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s else @@ -1067,7 +1076,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(M4_AIX) ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s else @@ -1077,7 +1086,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(M4_AIX) ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s else @@ -1087,7 +1096,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(M4_AIX) ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s else @@ -1097,7 +1106,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(M4_AIX) ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s else @@ -1107,7 +1116,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(M4_AIX) ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1119,7 +1128,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(M4_AIX) ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1131,7 +1140,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(M4_AIX) ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1143,7 +1152,7 @@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + $(M4_AIX) ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1155,7 +1164,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(M4_AIX) ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1167,7 +1176,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(M4_AIX) ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1179,7 +1188,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s - m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(M4_AIX) ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1191,7 +1200,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(M4_AIX) ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else ifeq ($(CORE), SANDYBRIDGE) @@ -1213,7 +1222,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -1373,7 +1382,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(M4_AIX) dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else @@ -2965,7 +2974,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -3011,7 +3020,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s + $(M4_AIX) strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else From 301e2ecc49aa9207573a062fe272f902161c48a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Oct 2023 22:15:46 +0200 Subject: [PATCH 364/718] Cray Fortran uses -O in combinations like -O omp so don't filter that out --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 299970c67..8621a8b3f 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,11 @@ export NO_LAPACK export C_LAPACK endif +ifeq ($(F_COMPILER),CRAY) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) +else LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) +endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test From b41cab08756563819e0cbc7ab005ab746fa4721b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Oct 2023 22:20:59 +0200 Subject: [PATCH 365/718] Need to use override to actually strip down the already defined FFLAGS for NAG and CCE Fortran --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 77c36c870..868cca4f9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1642,11 +1642,11 @@ endif ifeq ($(F_COMPILER),NAG) LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif ifeq ($(F_COMPILER),CRAY) LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif LAPACK_CFLAGS = $(CFLAGS) From f8c230c21c0bb20dc61d14988069ae60df6e0423 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 18 Oct 2023 11:58:54 +0200 Subject: [PATCH 366/718] Switch MINGW-W64 jobs to UCRT --- .github/workflows/dynamic_arch.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 4fe6e63fc..0c39bfddf 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -151,13 +151,13 @@ jobs: strategy: fail-fast: false matrix: - msystem: [MINGW64, MINGW32, CLANG64, CLANG32] + msystem: [UCRT64, MINGW32, CLANG64, CLANG32] idx: [int32, int64] build-type: [Release] include: - - msystem: MINGW64 + - msystem: UCRT64 idx: int32 - target-prefix: mingw-w64-x86_64 + target-prefix: mingw-w64-ucrt-x86_64 fc-pkg: fc - msystem: MINGW32 idx: int32 @@ -175,10 +175,10 @@ jobs: target-prefix: mingw-w64-clang-i686 fc-pkg: cc c-lapack-flags: -DC_LAPACK=ON - - msystem: MINGW64 + - msystem: UCRT64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 - target-prefix: mingw-w64-x86_64 + target-prefix: mingw-w64-ucrt-x86_64 fc-pkg: fc - msystem: CLANG64 idx: int64 @@ -188,9 +188,9 @@ jobs: # Compiling with Flang 16 seems to cause test errors on machines # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. no-avx512-flags: -DNO_AVX512=1 - - msystem: MINGW64 + - msystem: UCRT64 idx: int32 - target-prefix: mingw-w64-x86_64 + target-prefix: mingw-w64-ucrt-x86_64 fc-pkg: fc build-type: None exclude: From e12aaed13d39a77bb089d5c39478ed203160f196 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 18 Oct 2023 16:28:54 +0200 Subject: [PATCH 367/718] Fix unwanted fallthrough from Intel Family 6 to 15 in case of identification failure --- driver/others/dynamic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 8e0f53f74..69a473060 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -805,7 +805,8 @@ static gotoblas_t *get_coretype(void){ } return NULL; } - case 0xf: + break; + case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; return &gotoblas_PRESCOTT; } From 6b8379d6d998f94ecb9a6adccc6cbd3f1d23f1c7 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 19 Oct 2023 11:38:26 +0200 Subject: [PATCH 368/718] Run nightly Homebrew cron job only on the main repo, not on forks I noticed this because GitHub emailed me that it would disable the nightly job because it hadn't changed for 3 months. It currently takes 30-50 minutes daily, and by default runs on all forks of the main repository that have the relevant workflow yaml file. That serves little purpose and wastes quite a bit of energy - so disable the runs outside of the main repo. This will not disable the runs on forks already made in the past that contain this workflow file, but it does save 3 months worth of runs on every new fork that is created. [skip ci] --- .github/workflows/nightly-Homebrew-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 96063565d..eb315f1d4 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -23,6 +23,7 @@ permissions: jobs: build-OpenBLAS-with-Homebrew: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: macos-latest env: DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer From 980f702f72c20d01f4110e40fc44e35352812b8b Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 19 Oct 2023 18:48:19 -0500 Subject: [PATCH 369/718] POWER: AIX: Make use of power10 optimization POWER10 optimizations are disabled when using default AIX assembler. As we have fixed many issues recently, enabling optimization path for default assembler. --- Makefile.system | 8 ++++---- kernel/power/KERNEL.POWER10 | 4 ---- param.h | 5 ----- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/Makefile.system b/Makefile.system index 868cca4f9..30b0ddec2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,10 +277,6 @@ endif ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 -# Determine if the assembler is GNU Assembler -HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) -GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) - # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) @@ -760,7 +756,11 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +ifeq ($(OSNAME), AIX) +LDVERSIONGTEQ35 := 1 +else LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) +endif ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 58f865322..9047c714c 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,6 +1,3 @@ -ifeq ($(HAVE_GAS), 1) -include $(KERNELDIR)/KERNEL.POWER8 -else #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -265,4 +262,3 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -endif diff --git a/param.h b/param.h index 03bf3624f..ee4640f57 100644 --- a/param.h +++ b/param.h @@ -2600,13 +2600,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#if defined(HAVE_GAS) && (HAVE_GAS == 1) -#define DGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_N 4 -#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 -#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 From a7f73c764cee8fada4f7f359ae4a8be6b9810ada Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Oct 2023 16:48:47 +0200 Subject: [PATCH 370/718] Clarify "make" options and the need to repeat them in the install step --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 081d45870..f57cec831 100644 --- a/README.md +++ b/README.md @@ -54,10 +54,15 @@ Building OpenBLAS requires the following to be installed: Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. -The full target list is in the file `TargetList.txt`. For building with `cmake`, the -usual conventions apply, i.e. create a build directory either underneath the toplevel -OpenBLAS source directory or separate from it, and invoke `cmake` there with the path -to the source tree and any build options you plan to set. +The full target list is in the file `TargetList.txt`, other build optionss are documented in Makefile.rule and +can either be set there (typically by removing the comment character from the respective line), or used on the +`make` command line. +Note that when you run `make install` after building, you need to repeat all command line options you provided to `make` +in the build step, as some settings like the supported maximum number of threads are automatically derived from the +build host by default, which might not be what you want. +For building with `cmake`, the usual conventions apply, i.e. create a build directory either underneath the toplevel +OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any +build options you plan to set. ### Cross compile From f5e1f20f4db408d826cb89638175f1987304cf5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Oct 2023 17:10:37 +0200 Subject: [PATCH 371/718] Update target list --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f57cec831..3c4e38f18 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Use `PREFIX=` when invoking `make`, for example ```sh make install PREFIX=your_installation_directory ``` - +(along with all options you added on the `make` command line in the preceding build step) The default installation directory is `/opt/OpenBLAS`. ## Supported CPUs and Operating Systems @@ -142,7 +142,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -- **AMD ZEN**: Uses Haswell codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations for Zen 2/3 (use SkylakeX for Zen4) #### MIPS32 @@ -174,13 +174,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **TSV110**: Optimized some Level-3 helper functions - **EMAG 8180**: preliminary support based on A57 - **Neoverse N1**: (AWS Graviton2) preliminary support -- **Apple Vortex**: preliminary support based on ARMV8 +- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS +- **Apple Vortex**: preliminary support based on ThunderX2/3 +- **A64FX**: preliminary support, optimized Level-3 BLAS +- **ARMV8SVE**: any ARMV8 cpu with SVE extensions #### PPC/PPC64 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. -- **POWER10**: +- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2. #### IBM zEnterprise System From 1cec1c0fc7509a949b65ce5bb50696c18838046e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Oct 2023 14:43:19 +0200 Subject: [PATCH 372/718] Add FreeBSD clang/gfortran build with OpenMP --- .cirrus.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 02cd40997..c405b958d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -148,6 +148,15 @@ FreeBSD_task: - ls -l /usr/local/lib - gmake CC=gcc INTERFACE64=1 +FreeBSD_task: + name: FreeBSD-clang-openmp + freebsd_instance: + image_family: freebsd-13-2 + install_script: + - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + compile_script: + - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 + #task: # name: Windows/LLVM16 --- too slow --- # windows_container: From 289a5f6d9b8570de6fa5c2bf2789e04abce494ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Oct 2023 18:44:58 +0200 Subject: [PATCH 373/718] work around libgfortran install issue on FreeBSD --- .cirrus.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.cirrus.yml b/.cirrus.yml index c405b958d..5a1f2cfda 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -154,6 +154,7 @@ FreeBSD_task: image_family: freebsd-13-2 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc + - ln -s /usr/local/gcc12/lib/libgfortran.so.5.0.0 /usr/lib/libgfortran.so compile_script: - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 From dc1c880782e33307aaa2b04467b110003f3305e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Oct 2023 23:14:36 +0200 Subject: [PATCH 374/718] fix libgfortran path on bsd --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5a1f2cfda..6c2baf8a0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -154,7 +154,7 @@ FreeBSD_task: image_family: freebsd-13-2 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - - ln -s /usr/local/gcc12/lib/libgfortran.so.5.0.0 /usr/lib/libgfortran.so + - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so compile_script: - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 From d003ad630b1792f169373b8ab35c5ea7a6dfdccd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Oct 2023 10:26:38 +0100 Subject: [PATCH 375/718] Increase the default GEMM buffer size on modern ARM server cpus --- common_arm64.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_arm64.h b/common_arm64.h index 436ccb8f5..1e593c66f 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -162,7 +162,11 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE +if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#define BUFFER_SIZE (32 << 22) +else #define BUFFER_SIZE (32 << 20) +#endif #else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif From 728788f6676bb5e999cdf4fbcda9e2c7b8b9cd53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Oct 2023 11:08:22 +0100 Subject: [PATCH 376/718] typo fix --- common_arm64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 1e593c66f..6ae6a35a3 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -162,9 +162,9 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) #define BUFFER_SIZE (32 << 22) -else +#else #define BUFFER_SIZE (32 << 20) #endif #else From c8882bd9d890c332adaf992a0b9da6be8384bb97 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 1 Nov 2023 14:53:55 -0500 Subject: [PATCH 377/718] Remove POWER7 from cpu list. --- driver/others/dynamic_power.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index db04e635f..b4a1cc6be 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -39,7 +39,6 @@ char *gotoblas_corename(void) { #define CPU_UNKNOWN 0 #define CPU_POWER5 5 #define CPU_POWER6 6 -#define CPU_POWER7 7 #define CPU_POWER8 8 #define CPU_POWER9 9 #define CPU_POWER10 10 @@ -53,9 +52,6 @@ static int cpuid(void) #ifdef POWER_6 if (arch == POWER_6) return CPU_POWER6; #endif -#ifdef POWER_7 - else if (arch == POWER_7) return CPU_POWER7; -#endif #ifdef POWER_8 else if (arch == POWER_8) return CPU_POWER8; #endif From 7dcb2d67f23caa8b70df4ea37c05a12ff8c15898 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 1 Nov 2023 15:23:28 -0500 Subject: [PATCH 378/718] Have POWER7 return arch=POWER6. --- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index b4a1cc6be..10a5d64b3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,9 @@ static int cpuid(void) #ifdef POWER_6 if (arch == POWER_6) return CPU_POWER6; #endif +#ifdef POWER_7 + else if (arch == POWER_7) return CPU_POWER6; +#endif #ifdef POWER_8 else if (arch == POWER_8) return CPU_POWER8; #endif From 3bfa4d4dccf8616ab330387a7be1ebd709a3214c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Nov 2023 14:55:31 +0100 Subject: [PATCH 379/718] Fix outdated SVE kernel definitions for Cortex cpus by aliasing to ARMV8SVE --- kernel/arm64/KERNEL.CORTEXA510 | 217 +-------------------------------- kernel/arm64/KERNEL.CORTEXA710 | 217 +-------------------------------- kernel/arm64/KERNEL.CORTEXX2 | 217 +-------------------------------- 3 files changed, 3 insertions(+), 648 deletions(-) diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXA510 +++ b/kernel/arm64/KERNEL.CORTEXA510 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXA710 +++ b/kernel/arm64/KERNEL.CORTEXA710 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2 index bd25f7cd8..bc5999097 100644 --- a/kernel/arm64/KERNEL.CORTEXX2 +++ b/kernel/arm64/KERNEL.CORTEXX2 @@ -1,216 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = sgemm_ncopy_sve_v1.c -SGEMMITCOPY = sgemm_tcopy_sve_v1.c -SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = dgemm_ncopy_sve_v1.c -DGEMMITCOPY = dgemm_tcopy_sve_v1.c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE From 9019bc494514a74c2042152cdca0a36adea7b42f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Nov 2023 22:10:06 +0100 Subject: [PATCH 380/718] Use SkylakeX ?ASUM microkernel for Cooperlake/Sapphirerapids as well --- kernel/x86_64/casum.c | 2 +- kernel/x86_64/dasum.c | 2 +- kernel/x86_64/sasum.c | 2 +- kernel/x86_64/zasum.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index 60feec0ce..e4d054311 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -4,7 +4,7 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "casum_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index a9c40f38f..0147c6978 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -4,7 +4,7 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "dasum_microk_skylakex-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 37a92468f..3f22cb97a 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -9,7 +9,7 @@ #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sasum_microk_skylakex-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 80e95a2c8..3f17ab1cf 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -4,7 +4,7 @@ #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) #endif -#if defined(SKYLAKEX) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "zasum_microk_skylakex-2.c" #endif From 04bc801999e8d6e6ed101a1ab8ec9720f271ad2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Nov 2023 23:48:59 +0100 Subject: [PATCH 381/718] (Re)apply fixes for supporting only a subset of precision types from PR 3915 --- kernel/Makefile.L3 | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 448e096a3..863f376e9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -182,7 +182,7 @@ ifeq ($(BUILD_BFLOAT16),1) SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -191,7 +191,7 @@ SBLASOBJS += \ strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_DOUBLE),1) +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DBLASOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -207,7 +207,7 @@ QBLASOBJS += \ qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) -ifeq ($(BUILD_COMPLEX),1) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS += \ ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ @@ -255,7 +255,7 @@ XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_SINGLE),1) +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SBLASOBJS += \ strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -264,10 +264,7 @@ SBLASOBJS += \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" -SBLASOBJS += \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -275,10 +272,7 @@ SBLASOBJS += \ strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(BUILD_SINGLE),1) -SBLASOBJS += \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) endif @@ -400,7 +394,7 @@ XBLASOBJS += \ ifeq ($(USE_GEMM3M), 1) -ifeq ($(BUILD_COMPLEX),1) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS += \ cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ From 3a86fde5a7f762fe94196c6e090377721bc551b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Nov 2023 23:52:46 +0100 Subject: [PATCH 382/718] Delete misplaced file from PR 3915 after applying in the correct place --- Makefile.L3 | 5089 --------------------------------------------------- 1 file changed, 5089 deletions(-) delete mode 100644 Makefile.L3 diff --git a/Makefile.L3 b/Makefile.L3 deleted file mode 100644 index 76586d826..000000000 --- a/Makefile.L3 +++ /dev/null @@ -1,5089 +0,0 @@ -USE_GEMM3M = 0 -OS := $(shell uname) - -ifeq ($(ARCH), x86) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), x86_64) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), x86_64) -USE_DIRECT_SGEMM = 1 -endif - -ifeq ($(ARCH), ia64) -USE_GEMM3M = 1 -endif - -ifeq ($(ARCH), arm) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), arm64) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), riscv64) -USE_TRMM = 1 -endif - -ifneq ($(DYNAMIC_ARCH), 1) -ifeq ($(TARGET), GENERIC) -USE_TRMM = 1 -endif -endif - -ifeq ($(CORE), HASWELL) -USE_TRMM = 1 -endif - -ifeq ($(CORE), SKYLAKEX) -USE_TRMM = 1 -endif - -ifeq ($(CORE), COOPERLAKE) -USE_TRMM = 1 -endif - -ifeq ($(CORE), SAPPHIRERAPIDS) -USE_TRMM = 1 -endif - -ifeq ($(CORE), ZEN) -USE_TRMM = 1 -endif - -ifeq ($(CORE), POWER8) -ifeq ($(BINARY64),1) -USE_TRMM = 1 -endif -endif - -ifeq ($(CORE), POWER9) -USE_TRMM = 1 -endif - -ifeq ($(CORE), POWER10) -USE_TRMM = 1 -endif - -ifeq ($(ARCH), zarch) -USE_TRMM = 1 -endif - -ifeq ($(CORE), Z14) -USE_TRMM = 1 -endif - -ifdef USE_DIRECT_SGEMM -ifndef SGEMMDIRECTKERNEL -SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c -SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c -endif -endif - -ifeq ($(BUILD_BFLOAT16), 1) -ifndef SBGEMMKERNEL -SBGEMM_BETA = ../generic/gemm_beta.c -SBGEMMKERNEL = ../generic/gemmkernel_2x2.c -SBGEMMINCOPY = ../generic/gemm_ncopy_2.c -SBGEMMITCOPY = ../generic/gemm_tcopy_2.c -SBGEMMONCOPY = ../generic/gemm_ncopy_2.c -SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) -SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) -SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) -SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) -endif - -SBKERNELOBJS += \ - sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ - $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SKERNELOBJS += \ - sgemm_kernel$(TSUFFIX).$(SUFFIX) \ - sgemm_beta$(TSUFFIX).$(SUFFIX) \ - $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ - $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) - -ifdef USE_DIRECT_SGEMM -SKERNELOBJS += \ - sgemm_direct$(TSUFFIX).$(SUFFIX) \ - sgemm_direct_performant$(TSUFFIX).$(SUFFIX) -endif -endif - -ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DKERNELOBJS += \ - dgemm_beta$(TSUFFIX).$(SUFFIX) \ - dgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ - $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) -endif - -QKERNELOBJS += \ - qgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ - $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CKERNELOBJS += \ - cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ - $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZKERNELOBJS += \ - zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ - $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) -endif - -XKERNELOBJS += \ - xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ - xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ - $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ - $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += $(SBKERNELOBJS) -endif -SBLASOBJS += $(SKERNELOBJS) -DBLASOBJS += $(DKERNELOBJS) -QBLASOBJS += $(QKERNELOBJS) -CBLASOBJS += $(CKERNELOBJS) -ZBLASOBJS += $(ZKERNELOBJS) -XBLASOBJS += $(XKERNELOBJS) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SBLASOBJS += \ - sgemm_beta$(TSUFFIX).$(SUFFIX) \ - strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) -endif - -ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DBLASOBJS += \ - dgemm_beta$(TSUFFIX).$(SUFFIX) \ - dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) -endif - -QBLASOBJS += \ - qgemm_beta$(TSUFFIX).$(SUFFIX) \ - qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - cgemm_beta$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zgemm_beta$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xgemm_beta$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) - -ifeq ($(USE_GEMM3M), 1) - -CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) -ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) -XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) - -endif - -ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" -SBLASOBJS += \ - strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_DOUBLE),1) -DBLASOBJS += \ - dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -QBLASOBJS += \ - qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) - -ifeq ($(BUILD_COMPLEX),1) -CBLASOBJS += \ - ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) -endif -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ - xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) - -ifeq ($(USE_GEMM3M), 1) - -ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" -CBLASOBJS += \ - cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) -endif - -XBLASOBJS += \ - xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ - xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ - xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ - xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ - xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ - xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) - -endif - -###### BLAS small matrix optimization ##### -ifeq ($(SMALL_MATRIX_OPT), 1) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLASOBJS += \ - sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) -endif - -SBLASOBJS += \ - sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) - -DBLASOBJS += \ - dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) - -CBLASOBJS += \ - cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ - cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) - -ZBLASOBJS += \ - zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ - zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) - -endif - -###### BLAS extensions ##### - -ifeq ($(BUILD_SINGLE),1) -SBLASOBJS += \ - somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - sgeadd_k$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(BUILD_DOUBLE),1) -DBLASOBJS += \ - domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - dgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX),1) -CBLASOBJS += \ - comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - cgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_COMPLEX16),1) -ZBLASOBJS += \ - zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ - zgeadd_k$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(BUILD_BFLOAT16), 1) -SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -endif - -SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) - -ifeq ($(BUILD_BFLOAT16),1) -$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ - -ifeq ($(ARCH), E2K) -USE_TRMM = 1 -endif - - -ifeq ($(BUILD_BFLOAT16), 1) - -$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) - -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s - m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ - rm sbgemmotcopy.s sbgemmotcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) - -$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s - m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ - rm sbgemmitcopy.s sbgemmitcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif -endif - -$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s - m4 sgemmotcopy.s > sgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ - rm sgemmotcopy.s sgemmotcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - - -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) - -$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s - m4 sgemmitcopy.s > sgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ - rm sgemmitcopy.s sgemmitcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s - m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ - rm dgemm_ncopy.s dgemm_ncopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s - m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ - rm dgemm_itcopy.s dgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -ifdef EXPRECISION - -$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) - -$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) - -$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s - m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ - rm cgemm_itcopy.s cgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) - -$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s - m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ - rm zgemm_itcopy.s zgemm_itcopy_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -endif - -ifdef EXPRECISION - -$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) - -$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s - m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifdef USE_DIRECT_SGEMM -$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -endif - -ifeq ($(BUILD_BFLOAT16), 1) - -$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s - m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif -endif - -$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s - m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s -else - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ - rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ -endif - -$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ - rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ -endif - -$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ - rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ - rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ -endif - -$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ - rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ -endif - -$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ - rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ -endif - -$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ - rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ - rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s -else ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ -else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ -endif - -$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ - - -ifdef USE_TRMM -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s - m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ - rm strmmkernel_ln.s strmmkernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s - m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ - rm strmmkernel_lt.s strmmkernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s - m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ - rm strmmkernel_rn.s strmmkernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s - m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ - rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s - m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ - rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s - m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ - rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s - m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ - rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ - rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ - rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ - rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ - rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ - rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ - rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ - rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ - rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ - rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ - rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ - rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ - rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ - rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ - rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s - m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ - rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ - rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s -else ifeq ($(CORE), SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif - -else -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ -endif -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ -endif -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ -endif -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) -ifeq ($(CORE),SANDYBRIDGE) - $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ -endif -endif - - - - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ - rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s -else - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ -endif - -$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - - -ifdef STRMMUNCOPY_M -$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef STRMMLNCOPY_M -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef STRMMUTCOPY_M -$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef STRMMLTCOPY_M -$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef DTRMMUNCOPY_M -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMLNCOPY_M -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMUTCOPY_M -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef DTRMMLTCOPY_M -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef CTRMMUNCOPY_M -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMLNCOPY_M -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMUTCOPY_M -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef CTRMMLTCOPY_M -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRMMUNCOPY_M -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMLNCOPY_M -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMUTCOPY_M -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRMMLTCOPY_M -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef SSYMMUCOPY_M -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef SSYMMLCOPY_M -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef DSYMMUCOPY_M -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef DSYMMLCOPY_M -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef CSYMMUCOPY_M -$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef CSYMMLCOPY_M -$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -ifdef ZSYMMUCOPY_M -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -else -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ -endif - -ifdef ZSYMMLCOPY_M -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -else -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ -endif - -$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -ifdef CHEMMUTCOPY_M -$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -else -$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -endif - -ifdef CHEMMLTCOPY_M -$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -else -$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -endif - -$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -ifdef ZHEMMUTCOPY_M -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -else -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ -endif - -ifdef ZHEMMLTCOPY_M -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -else -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ -endif - -$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -ifdef TRSMCOPYUN_M -$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLN_M -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYUT_M -$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLT_M -$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef TRSMCOPYUN_M -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLN_M -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYUT_M -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef TRSMCOPYLT_M -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRSMCOPYUN_M -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLN_M -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYUT_M -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLT_M -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -ifdef ZTRSMCOPYUN_M -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLN_M -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYUT_M -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ -endif - -ifdef ZTRSMCOPYLT_M -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -else -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ -endif - -$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - - -$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifeq ($(BUILD_BFLOAT16),1) -$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ - - -ifeq ($(BUILD_BFLOAT16), 1) -$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -endif -endif - -$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) - -$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -endif - -$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -endif - -ifdef EXPRECISION - -$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) - -$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - -$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) - -$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -endif - -$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) - -$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -endif - -ifdef EXPRECISION - -$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) - -$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -endif - -endif - - -ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) - $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif - -$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) - $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ - rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s -else - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ -endif - -$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ - -$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s - m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ - rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s -else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ -endif - -$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ - -$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ - -$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - -$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - -$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ - -$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ - -$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) - $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ - - -$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ - -$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ - -$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ - -$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ - -$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ - -$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ - -$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ - -$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ - -$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c - $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ - -$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ - -$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c - $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ - - -##### BLAS extensions ###### - -ifndef DOMATCOPY_CN -DOMATCOPY_CN = ../arm/omatcopy_cn.c -endif - -$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DOMATCOPY_RN -DOMATCOPY_RN = ../arm/omatcopy_rn.c -endif - -$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DOMATCOPY_CT -DOMATCOPY_CT = ../arm/omatcopy_ct.c -endif - -$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DOMATCOPY_RT -DOMATCOPY_RT = ../arm/omatcopy_rt.c -endif - -$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DIMATCOPY_CN -DIMATCOPY_CN = ../generic/imatcopy_cn.c -endif - -$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DIMATCOPY_RN -DIMATCOPY_RN = ../generic/imatcopy_rn.c -endif - -$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef DIMATCOPY_CT -DIMATCOPY_CT = ../generic/imatcopy_ct.c -endif - -$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DIMATCOPY_RT -DIMATCOPY_RT = ../generic/imatcopy_rt.c -endif - -$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SOMATCOPY_CN -SOMATCOPY_CN = ../arm/omatcopy_cn.c -endif - -$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SOMATCOPY_RN -SOMATCOPY_RN = ../arm/omatcopy_rn.c -endif - -$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SOMATCOPY_CT -SOMATCOPY_CT = ../arm/omatcopy_ct.c -endif - -$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SOMATCOPY_RT -SOMATCOPY_RT = ../arm/omatcopy_rt.c -endif - -$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SIMATCOPY_CN -SIMATCOPY_CN = ../generic/imatcopy_cn.c -endif - -$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SIMATCOPY_RN -SIMATCOPY_RN = ../generic/imatcopy_rn.c -endif - -$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - -ifndef SIMATCOPY_CT -SIMATCOPY_CT = ../generic/imatcopy_ct.c -endif - -$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef SIMATCOPY_RT -SIMATCOPY_RT = ../generic/imatcopy_rt.c -endif - -$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ - - -ifndef COMATCOPY_CN -COMATCOPY_CN = ../arm/zomatcopy_cn.c -endif - -$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_RN -COMATCOPY_RN = ../arm/zomatcopy_rn.c -endif - -$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_CT -COMATCOPY_CT = ../arm/zomatcopy_ct.c -endif - -$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_RT -COMATCOPY_RT = ../arm/zomatcopy_rt.c -endif - -$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef COMATCOPY_CNC -COMATCOPY_CNC = ../arm/zomatcopy_cnc.c -endif - -$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_RNC -COMATCOPY_RNC = ../arm/zomatcopy_rnc.c -endif - -$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_CTC -COMATCOPY_CTC = ../arm/zomatcopy_ctc.c -endif - -$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef COMATCOPY_RTC -COMATCOPY_RTC = ../arm/zomatcopy_rtc.c -endif - -$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_CN -CIMATCOPY_CN = ../generic/zimatcopy_cn.c -endif - -$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_RN -CIMATCOPY_RN = ../generic/zimatcopy_rn.c -endif - -$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_CT -CIMATCOPY_CT = ../generic/zimatcopy_ct.c -endif - -$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_RT -CIMATCOPY_RT = ../generic/zimatcopy_rt.c -endif - -$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef CIMATCOPY_CNC -CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c -endif - -$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_RNC -CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c -endif - -$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_CTC -CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c -endif - -$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef CIMATCOPY_RTC -CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c -endif - -$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - - - -ifndef ZOMATCOPY_CN -ZOMATCOPY_CN = ../arm/zomatcopy_cn.c -endif - -$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_RN -ZOMATCOPY_RN = ../arm/zomatcopy_rn.c -endif - -$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_CT -ZOMATCOPY_CT = ../arm/zomatcopy_ct.c -endif - -$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_RT -ZOMATCOPY_RT = ../arm/zomatcopy_rt.c -endif - -$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZOMATCOPY_CNC -ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c -endif - -$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_RNC -ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c -endif - -$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_CTC -ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c -endif - -$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZOMATCOPY_RTC -ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c -endif - -$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_CN -ZIMATCOPY_CN = ../generic/zimatcopy_cn.c -endif - -$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_RN -ZIMATCOPY_RN = ../generic/zimatcopy_rn.c -endif - -$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_CT -ZIMATCOPY_CT = ../generic/zimatcopy_ct.c -endif - -$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_RT -ZIMATCOPY_RT = ../generic/zimatcopy_rt.c -endif - -$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ - -ifndef ZIMATCOPY_CNC -ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c -endif - -$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_RNC -ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c -endif - -$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_CTC -ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c -endif - -$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ - -ifndef ZIMATCOPY_RTC -ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c -endif - -$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ - - -ifndef SGEADD_K -SGEADD_K = ../generic/geadd.c -endif - -$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef DGEADD_K -DGEADD_K = ../generic/geadd.c -endif - -$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ - -ifndef CGEADD_K -CGEADD_K = ../generic/zgeadd.c -endif - -$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ - -ifndef ZGEADD_K -ZGEADD_K = ../generic/zgeadd.c -endif - -$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ - - - -###### BLAS small matrix optimization ##### - -ifndef DGEMM_SMALL_M_PERMIT -DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef DGEMM_SMALL_K_NN -DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef DGEMM_SMALL_K_NT -DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef DGEMM_SMALL_K_TN -DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef DGEMM_SMALL_K_TT -DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ - -ifndef DGEMM_SMALL_K_B0_NN -DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef DGEMM_SMALL_K_B0_NT -DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef DGEMM_SMALL_K_B0_TN -DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef DGEMM_SMALL_K_B0_TT -DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ - -ifndef SGEMM_SMALL_M_PERMIT -SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef SGEMM_SMALL_K_NN -SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SGEMM_SMALL_K_NT -SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SGEMM_SMALL_K_TN -SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SGEMM_SMALL_K_TT -SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - -ifndef SGEMM_SMALL_K_B0_NN -SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SGEMM_SMALL_K_B0_NT -SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SGEMM_SMALL_K_B0_TN -SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SGEMM_SMALL_K_B0_TT -SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - - -ifeq ($(BUILD_BFLOAT16), 1) -ifndef SBGEMM_SMALL_M_PERMIT -SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c -endif - -ifndef SBGEMM_SMALL_K_NN -SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SBGEMM_SMALL_K_NT -SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SBGEMM_SMALL_K_TN -SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SBGEMM_SMALL_K_TT -SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ - -ifndef SBGEMM_SMALL_K_B0_NN -SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c -endif - -ifndef SBGEMM_SMALL_K_B0_NT -SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c -endif - -ifndef SBGEMM_SMALL_K_B0_TN -SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c -endif - -ifndef SBGEMM_SMALL_K_B0_TT -SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c -endif - -$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ - -$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ -endif - -ifndef CGEMM_SMALL_M_PERMIT -CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c -endif - -ifndef CGEMM_SMALL_K_NN -CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef CGEMM_SMALL_K_NT -CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef CGEMM_SMALL_K_TN -CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef CGEMM_SMALL_K_TT -CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ - -$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ - -$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ - -$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ - -$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ - -$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ - -$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ - -$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ - -$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ - -$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ - -$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ - -$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ - -$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ - -$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ - -ifndef CGEMM_SMALL_K_B0_NN -CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef CGEMM_SMALL_K_B0_NT -CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef CGEMM_SMALL_K_B0_TN -CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef CGEMM_SMALL_K_B0_TT -CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ - -$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ - -ifndef ZGEMM_SMALL_M_PERMIT -ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c -endif - -ifndef ZGEMM_SMALL_K_NN -ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef ZGEMM_SMALL_K_NT -ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef ZGEMM_SMALL_K_TN -ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef ZGEMM_SMALL_K_TT -ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ - - -$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ - -$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ - -$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ - -$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ - -$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ - -$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ - -$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ - -$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ - -$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ - -$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ - -$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ - -$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ - -$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ - -$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ - -$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ - -$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ - -ifndef ZGEMM_SMALL_K_B0_NN -ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c -endif - -ifndef ZGEMM_SMALL_K_B0_NT -ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c -endif - -ifndef ZGEMM_SMALL_K_B0_TN -ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c -endif - -ifndef ZGEMM_SMALL_K_B0_TT -ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c -endif - -$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ - -$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ From cd8eb83bae95f0ccc5308d1612b60d505847296e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 Nov 2023 18:13:37 +0100 Subject: [PATCH 383/718] Fix allocations and compiler warnings in ZROTG (#4289) * Clean up ZROTG --- interface/zrotg.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index 4d2a9d510..ea73352dd 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -30,14 +30,12 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { FLOAT db_r = *(DB+0); FLOAT db_i = *(DB+1); //long double r; - FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT)); - FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT)); + FLOAT S1[2]; + FLOAT R[2]; long double d; FLOAT ada = da_r * da_r + da_i * da_i; FLOAT adb = db_r * db_r + db_i * db_i; - FLOAT adart = sqrt( da_r * da_r + da_i * da_i); - FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); PRINT_DEBUG_NAME; @@ -115,10 +113,13 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { } } else { *C = ada / adahsq; - if (*C >= safmin) + if (*C >= safmin) { *R = *DA / *C; - else + *(R+1) = *(DA+1) / *(C+1); + } else { *R = *DA * (h / adahsq); + *(R+1) = *(DA+1) * (h / adahsq); + } *S = *S1 * ada / adahsq; *(S+1) = *(S1+1) * ada / adahsq; } @@ -178,4 +179,4 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { } } } - \ No newline at end of file + From ac7efc61fd41dd4f93bcf471bc8687e96fa882ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Nov 2023 00:03:33 +0100 Subject: [PATCH 384/718] Put more build information into Makefile.conf_last --- Makefile | 20 ++++++++++++++++---- Makefile.install | 8 ++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 8621a8b3f..5f6643402 100644 --- a/Makefile +++ b/Makefile @@ -35,11 +35,7 @@ export NO_LAPACK export C_LAPACK endif -ifeq ($(F_COMPILER),CRAY) -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) -else LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) -endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test @@ -210,9 +206,25 @@ ifeq ($(DYNAMIC_OLDER), 1) @echo DYNAMIC_OLDER=1 >> Makefile.conf_last endif endif + @echo TARGET=$(CORE) >> Makefile.conf_last ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif +ifdef SMP +ifdef NUM_THREADS + @echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last +else + @echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last +endif +endif +ifeq ($(USE_OPENMP),1) + @echo USE_OPENMP=1 >> Makefile.conf_last +endif +ifeq ($(INTERFACE64),1) + @echo INTERFACE64=1 >> Makefile.conf_last +endif + @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last + @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd diff --git a/Makefile.install b/Makefile.install index 01899b970..81f959177 100644 --- a/Makefile.install +++ b/Makefile.install @@ -3,6 +3,14 @@ export GOTOBLAS_MAKEFILE = 1 -include $(TOPDIR)/Makefile.conf_last include ./Makefile.system +ifdef THELIBNAME +LIBNAME=$(THELIBNAME) +LIBSONAME=$(THELIBSONAME) +endif +ifeq ($(INTERFACE64),1) +USE_64BITINT=1 +endif + PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include From cf8295da5ca2774a04fed14a89d7dc0de25f9146 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Nov 2023 21:22:26 +0100 Subject: [PATCH 385/718] Fix potential overflow in the calculation of MAXIT --- lapack-netlib/SRC/cbdsqr.f | 34 +++++++++++++++++++++++++--------- lapack-netlib/SRC/zbdsqr.f | 34 +++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f index 40706644e..cf1459ad2 100644 --- a/lapack-netlib/SRC/cbdsqr.f +++ b/lapack-netlib/SRC/cbdsqr.f @@ -204,6 +204,17 @@ *> algorithm through its inner loop. The algorithms stops *> (and so fails to converge) if the number of passes *> through the inner loop exceeds MAXITR*N**2. +*> +*> \endverbatim +* +*> \par Note: +* =========== +*> +*> \verbatim +*> Bug report from Cezary Dendek. +*> On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is +*> removed since it can overflow pretty easily (for N larger or equal +*> than 18,919). We instead use MAXITDIVN = MAXITR*N. *> \endverbatim * * Authors: @@ -214,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup bdsqr * * ===================================================================== SUBROUTINE CBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U, @@ -255,8 +266,8 @@ * .. * .. Local Scalars .. LOGICAL LOWER, ROTATE - INTEGER I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1, - $ NM12, NM13, OLDLL, OLDM + INTEGER I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M, + $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM REAL ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, $ SINR, SLL, SMAX, SMIN, SMINOA, @@ -389,20 +400,21 @@ 40 CONTINUE 50 CONTINUE SMINOA = SMINOA / SQRT( REAL( N ) ) - THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL ) + THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) ) ELSE * * Absolute accuracy desired * - THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL ) + THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) ) END IF * * Prepare for main iteration loop for the singular values * (MAXIT is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) * - MAXIT = MAXITR*N*N - ITER = 0 + MAXITDIVN = MAXITR*N + ITERDIVN = 0 + ITER = -1 OLDLL = -1 OLDM = -1 * @@ -418,8 +430,12 @@ * IF( M.LE.1 ) $ GO TO 160 - IF( ITER.GT.MAXIT ) - $ GO TO 200 + IF( ITER.GE.N ) THEN + ITER = ITER - N + ITERDIVN = ITERDIVN + 1 + IF( ITERDIVN.GE.MAXITDIVN ) + $ GO TO 200 + END IF * * Find diagonal block of matrix to work on * diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f index faedafc3c..865bb9dd5 100644 --- a/lapack-netlib/SRC/zbdsqr.f +++ b/lapack-netlib/SRC/zbdsqr.f @@ -204,6 +204,17 @@ *> algorithm through its inner loop. The algorithms stops *> (and so fails to converge) if the number of passes *> through the inner loop exceeds MAXITR*N**2. +*> +*> \endverbatim +* +*> \par Note: +* =========== +*> +*> \verbatim +*> Bug report from Cezary Dendek. +*> On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is +*> removed since it can overflow pretty easily (for N larger or equal +*> than 18,919). We instead use MAXITDIVN = MAXITR*N. *> \endverbatim * * Authors: @@ -214,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup bdsqr * * ===================================================================== SUBROUTINE ZBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U, @@ -255,8 +266,8 @@ * .. * .. Local Scalars .. LOGICAL LOWER, ROTATE - INTEGER I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1, - $ NM12, NM13, OLDLL, OLDM + INTEGER I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M, + $ MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM DOUBLE PRECISION ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU, $ OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL, $ SINR, SLL, SMAX, SMIN, SMINOA, @@ -389,20 +400,21 @@ 40 CONTINUE 50 CONTINUE SMINOA = SMINOA / SQRT( DBLE( N ) ) - THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL ) + THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) ) ELSE * * Absolute accuracy desired * - THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL ) + THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) ) END IF * * Prepare for main iteration loop for the singular values * (MAXIT is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) * - MAXIT = MAXITR*N*N - ITER = 0 + MAXITDIVN = MAXITR*N + ITERDIVN = 0 + ITER = -1 OLDLL = -1 OLDM = -1 * @@ -418,8 +430,12 @@ * IF( M.LE.1 ) $ GO TO 160 - IF( ITER.GT.MAXIT ) - $ GO TO 200 + IF( ITER.GE.N ) THEN + ITER = ITER - N + ITERDIVN = ITERDIVN + 1 + IF( ITERDIVN.GE.MAXITDIVN ) + $ GO TO 200 + END IF * * Find diagonal block of matrix to work on * From f6ec777701d5d5e8271c9c271e260873c9b05911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Nov 2023 21:40:50 +0100 Subject: [PATCH 386/718] Fix accumulation (Reference-LAPACK PR 909) --- lapack-netlib/SRC/classq.f90 | 81 ++++++++++++++++++------------------ lapack-netlib/SRC/dlassq.f90 | 81 ++++++++++++++++++------------------ lapack-netlib/SRC/slassq.f90 | 81 ++++++++++++++++++------------------ lapack-netlib/SRC/zlassq.f90 | 81 ++++++++++++++++++------------------ 4 files changed, 160 insertions(+), 164 deletions(-) diff --git a/lapack-netlib/SRC/classq.f90 b/lapack-netlib/SRC/classq.f90 index cb4e7971f..c5f793cc0 100644 --- a/lapack-netlib/SRC/classq.f90 +++ b/lapack-netlib/SRC/classq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> CLASSQ returns the values scl and smsq such that +!> CLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is COMPLEX array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is REAL -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is REAL -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine CLASSQ( n, x, incx, scl, sumsq ) +subroutine CLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, & sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml @@ -145,7 +132,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. complex(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -207,15 +194,27 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -229,7 +228,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -245,17 +244,17 @@ subroutine CLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/dlassq.f90 b/lapack-netlib/SRC/dlassq.f90 index fddd1bf38..37626844b 100644 --- a/lapack-netlib/SRC/dlassq.f90 +++ b/lapack-netlib/SRC/dlassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> DLASSQ returns the values scl and smsq such that +!> DLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is DOUBLE PRECISION array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is DOUBLE PRECISION -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is DOUBLE PRECISION -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine DLASSQ( n, x, incx, scl, sumsq ) +subroutine DLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, & sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml @@ -145,7 +132,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. real(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -198,15 +185,27 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -220,7 +219,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -236,17 +235,17 @@ subroutine DLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/slassq.f90 b/lapack-netlib/SRC/slassq.f90 index 19f49402b..c8959f4a7 100644 --- a/lapack-netlib/SRC/slassq.f90 +++ b/lapack-netlib/SRC/slassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> SLASSQ returns the values scl and smsq such that +!> SLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is REAL array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is REAL -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is REAL -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine SLASSQ( n, x, incx, scl, sumsq ) +subroutine SLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, & sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml @@ -145,7 +132,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. real(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -198,15 +185,27 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -220,7 +219,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -236,17 +235,17 @@ subroutine SLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return diff --git a/lapack-netlib/SRC/zlassq.f90 b/lapack-netlib/SRC/zlassq.f90 index 9346dacac..c35214766 100644 --- a/lapack-netlib/SRC/zlassq.f90 +++ b/lapack-netlib/SRC/zlassq.f90 @@ -34,28 +34,15 @@ !> !> \verbatim !> -!> ZLASSQ returns the values scl and smsq such that +!> ZLASSQ returns the values scale_out and sumsq_out such that !> -!> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq, +!> (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq, !> -!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is +!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is !> assumed to be non-negative. !> !> scale and sumsq must be supplied in SCALE and SUMSQ and -!> scl and smsq are overwritten on SCALE and SUMSQ respectively. -!> -!> If scale * sqrt( sumsq ) > tbig then -!> we require: scale >= sqrt( TINY*EPS ) / sbig on entry, -!> and if 0 < scale * sqrt( sumsq ) < tsml then -!> we require: scale <= sqrt( HUGE ) / ssml on entry, -!> where -!> tbig -- upper threshold for values whose square is representable; -!> sbig -- scaling constant for big numbers; \see la_constants.f90 -!> tsml -- lower threshold for values whose square is representable; -!> ssml -- scaling constant for small numbers; \see la_constants.f90 -!> and -!> TINY*EPS -- tiniest representable number; -!> HUGE -- biggest representable number. +!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively. !> !> \endverbatim ! @@ -72,7 +59,7 @@ !> \verbatim !> X is DOUBLE COMPLEX array, dimension (1+(N-1)*abs(INCX)) !> The vector for which a scaled sum of squares is computed. -!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. +!> x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n. !> \endverbatim !> !> \param[in] INCX @@ -82,24 +69,24 @@ !> If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n !> If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n !> If INCX = 0, x isn't a vector so there is no need to call -!> this subroutine. If you call it anyway, it will count x(1) +!> this subroutine. If you call it anyway, it will count x(1) !> in the vector norm N times. !> \endverbatim !> !> \param[in,out] SCALE !> \verbatim !> SCALE is DOUBLE PRECISION -!> On entry, the value scale in the equation above. -!> On exit, SCALE is overwritten with scl , the scaling factor +!> On entry, the value scale in the equation above. +!> On exit, SCALE is overwritten by scale_out, the scaling factor !> for the sum of squares. !> \endverbatim !> !> \param[in,out] SUMSQ !> \verbatim !> SUMSQ is DOUBLE PRECISION -!> On entry, the value sumsq in the equation above. -!> On exit, SUMSQ is overwritten with smsq , the basic sum of -!> squares from which scl has been factored out. +!> On entry, the value sumsq in the equation above. +!> On exit, SUMSQ is overwritten by sumsq_out, the basic sum of +!> squares from which scale_out has been factored out. !> \endverbatim ! ! Authors: @@ -130,10 +117,10 @@ !> !> \endverbatim ! -!> \ingroup OTHERauxiliary +!> \ingroup lassq ! ! ===================================================================== -subroutine ZLASSQ( n, x, incx, scl, sumsq ) +subroutine ZLASSQ( n, x, incx, scale, sumsq ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, & sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml @@ -145,7 +132,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! ! .. Scalar Arguments .. integer :: incx, n - real(wp) :: scl, sumsq + real(wp) :: scale, sumsq ! .. ! .. Array Arguments .. complex(wp) :: x(*) @@ -158,10 +145,10 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! ! Quick return if possible ! - if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return - if( sumsq == zero ) scl = one - if( scl == zero ) then - scl = one + if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return + if( sumsq == zero ) scale = one + if( scale == zero ) then + scale = one sumsq = zero end if if (n <= 0) then @@ -207,15 +194,27 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ! Put the existing sum of squares into one of the accumulators ! if( sumsq > zero ) then - ax = scl*sqrt( sumsq ) + ax = scale*sqrt( sumsq ) if (ax > tbig) then -! We assume scl >= sqrt( TINY*EPS ) / sbig - abig = abig + (scl*sbig)**2 * sumsq + if (scale > one) then + scale = scale * sbig + abig = abig + scale * (scale * sumsq) + else + ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable + abig = abig + scale * (scale * (sbig * (sbig * sumsq))) + end if else if (ax < tsml) then -! We assume scl <= sqrt( HUGE ) / ssml - if (notbig) asml = asml + (scl*ssml)**2 * sumsq + if (notbig) then + if (scale < one) then + scale = scale * ssml + asml = asml + scale * (scale * sumsq) + else + ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable + asml = asml + scale * (scale * (ssml * (ssml * sumsq))) + end if + end if else - amed = amed + scl**2 * sumsq + amed = amed + scale * (scale * sumsq) end if end if ! @@ -229,7 +228,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) if (amed > zero .or. LA_ISNAN(amed)) then abig = abig + (amed*sbig)*sbig end if - scl = one / sbig + scale = one / sbig sumsq = abig else if (asml > zero) then ! @@ -245,17 +244,17 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq ) ymin = asml ymax = amed end if - scl = one + scale = one sumsq = ymax**2*( one + (ymin/ymax)**2 ) else - scl = one / ssml + scale = one / ssml sumsq = asml end if else ! ! Otherwise all values are mid-range or zero ! - scl = one + scale = one sumsq = amed end if return From 176cc6348ed3c0415391c42478301430e9bbe031 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Nov 2023 21:55:19 +0100 Subject: [PATCH 387/718] Correct constant ALPHA to match changed algorithm (Reference-LAPACK PR 928) --- lapack-netlib/SRC/cunbdb6.f | 4 ++-- lapack-netlib/SRC/dorbdb6.f | 4 ++-- lapack-netlib/SRC/sorbdb6.f | 4 ++-- lapack-netlib/SRC/zunbdb6.f | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index b93a389d6..cd14d9295 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +174,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.1E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index 45c8ba8a2..142887684 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +174,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.1D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index b2449e3be..d320c9e46 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +174,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.1E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ed666e449..ac7fa4be3 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup unbdb6 * * ===================================================================== SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -174,7 +174,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.1D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), From 206e76187ed1e7e78d3f93258eb90fa753a9d1d9 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 7 Nov 2023 18:08:57 -0600 Subject: [PATCH 388/718] Fix FCOMMON_OPT for power. Error out for certain C and Fortran compiler combos in AIX. --- Makefile.power | 23 +++++++++++++++-------- Makefile.system | 4 ++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Makefile.power b/Makefile.power index ada51b2e1..95bada34f 100644 --- a/Makefile.power +++ b/Makefile.power @@ -13,9 +13,9 @@ ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif endif @@ -38,9 +38,9 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr9 -qtune=pwr9 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O2 -frecursive -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -fno-fast-math -mcpu=power9 -mtune=power9 endif ifeq ($(F_COMPILER), GFORTRAN) @@ -65,15 +65,15 @@ endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else ifeq ($(F_COMPILER), IBM) -FCOMMON_OPT += -O2 -qrecur -qnosave +FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize else -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif endif else @@ -135,6 +135,13 @@ endif ifdef BINARY64 +ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX) +$(error Using GCC and XLF on AIX is not a supported combination.) +endif +ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX) +$(error Using Clang and gFortran on AIX is not a supported combination.) +endif + ifeq ($(OSNAME), AIX) ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mpowerpc64 -maix64 diff --git a/Makefile.system b/Makefile.system index 30b0ddec2..cb19dea73 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1374,6 +1374,8 @@ ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic else ifeq ($(F_COMPILER), NAG) FCOMMON_OPT += -PIC +else ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -qpic=large else FCOMMON_OPT += -fPIC endif @@ -1626,9 +1628,11 @@ override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) ifeq ($(NEED_PIC), 1) ifeq (,$(findstring PIC,$(FFLAGS))) +ifneq ($(F_COMPILER),IBM) override FFLAGS += -fPIC endif endif +endif #For LAPACK Fortran codes. #Disable -fopenmp for LAPACK Fortran codes on Windows. From 5e31c5708393b4e086052c0e7b8cbc57d7ebede3 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 7 Nov 2023 20:58:34 -0600 Subject: [PATCH 389/718] Only define __builtin_cpu_is and __builtin_cpu_supports if not present. --- driver/others/dynamic_power.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 10a5d64b3..570a87568 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -202,6 +202,7 @@ static int cpuid(void) #ifndef __BUILTIN_CPU_SUPPORTS__ #include +#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_is) static int __builtin_cpu_is(const char *arg) { static int ipinfo = -1; @@ -224,12 +225,15 @@ static int __builtin_cpu_is(const char *arg) } return 0; } +#endif +#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports) static int __builtin_cpu_supports(const char *arg) { return 0; } #endif +#endif static gotoblas_t *get_coretype(void) { From 4eecccd49b251be2cb303b67093c4602afb39aec Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 8 Nov 2023 07:12:21 -0600 Subject: [PATCH 390/718] Fix __builtin_cpu_is for AIX. --- driver/others/dynamic_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 570a87568..f0faf2baf 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -202,7 +202,7 @@ static int cpuid(void) #ifndef __BUILTIN_CPU_SUPPORTS__ #include -#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_is) +#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is)) static int __builtin_cpu_is(const char *arg) { static int ipinfo = -1; @@ -227,7 +227,7 @@ static int __builtin_cpu_is(const char *arg) } #endif -#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports) +#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports)) static int __builtin_cpu_supports(const char *arg) { return 0; From 778e3b746a7217bbafa099133f956753e8355c4b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Nov 2023 11:55:29 +0100 Subject: [PATCH 391/718] Enable autodetection of current AMD cpus as their AVX512 Intel counterparts --- cpuid_x86.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index fdcead8bd..6cf4d6503 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1660,7 +1660,13 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - case 10: // Zen3 + case 10: // Zen3/4 +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; +#endif if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -2438,6 +2444,12 @@ int get_coretype(void){ // Ryzen 2 default: // Matisse,Renoir Ryzen2 models +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; +#endif if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 3ad27007fc71d5acb42c24569792146331b0cba3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Nov 2023 15:19:11 +0100 Subject: [PATCH 392/718] rebase --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 5f6643402..b344abcd2 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,11 @@ export NO_LAPACK export C_LAPACK endif +ifeq ($(F_COMPILER),CRAY) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS)) +else LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) +endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test From 1a308a006664b20ae9dbce5e3e69d52e09b44829 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Nov 2023 15:27:46 +0100 Subject: [PATCH 393/718] Move OpenMP dependency handling for clang/gfortran combo --- Makefile.system | 3 +++ f_check | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile.system b/Makefile.system index 30b0ddec2..c6c1dc02f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -608,6 +608,9 @@ endif ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -fopenmp +ifeq ($(F_COMPILER), GFORTRAN) +FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB)) +endif endif ifeq ($(C_COMPILER), INTEL) diff --git a/f_check b/f_check index 31f4376d0..dac34edee 100755 --- a/f_check +++ b/f_check @@ -373,13 +373,6 @@ if [ -n "$link" ]; then ;; esac - case "$flag" in *-lgomp*) - case "$CC" in *clang*) - flag="-lomp" - ;; - esac - esac - case "$flag" in -l*) case "$flag" in *ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\ From 1d4ed20c2f47994cba88432de2c5a5ea3c1db58e Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 10 Nov 2023 15:58:30 +0100 Subject: [PATCH 394/718] Add conditions to all CI jobs to only run on main repo by default This is a follow-up to gh-4271. At the moment, when a contributor pushes the latest `develop` to their own branch to bring their own fork in sync with `main`, or if they push another branch, this triggers 30 CI jobs to run. Most will complete silently and only burn CPU time unnecessarily. If there's a failure, this may result in unexpected failure notifications. And the AWS Graviton3 run won't complete at all and time out, since the Cirun hook will only work when triggered from the main repo. --- .github/workflows/arm64_graviton.yml | 1 + .github/workflows/c910v.yml | 1 + .github/workflows/dynamic_arch.yml | 3 +++ .github/workflows/loongarch64.yml | 1 + .github/workflows/mips64.yml | 1 + 5 files changed, 7 insertions(+) diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml index bcb05047c..9dd0ae589 100644 --- a/.github/workflows/arm64_graviton.yml +++ b/.github/workflows/arm64_graviton.yml @@ -7,6 +7,7 @@ permissions: jobs: build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}" strategy: diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 199304fb1..e6ed08f2c 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -7,6 +7,7 @@ permissions: jobs: TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest env: xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 0c39bfddf..02429e317 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -7,6 +7,7 @@ permissions: jobs: build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ${{ matrix.os }} strategy: @@ -146,6 +147,7 @@ jobs: msys2: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: windows-latest strategy: @@ -312,6 +314,7 @@ jobs: cross_build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-22.04 strategy: diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index e0236ca86..fa62d0b41 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -4,6 +4,7 @@ on: [push, pull_request] jobs: TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index de7c0c0f3..7f09d4fca 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -7,6 +7,7 @@ permissions: jobs: TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest strategy: fail-fast: false From 2418a20f1fb89783dc1e198fd0e2bd78ecf43e77 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 10 Nov 2023 16:05:52 +0100 Subject: [PATCH 395/718] Cancel running CI jobs when new changes are pushed to a PR The `group` expression ensures that the cancel-in-progress behavior is to only cancel if a new commit is pushed to the PR for which the job is running, not other PRs. This is a fairly standard snippet, used also in CI jobs for NumPy and other projects. --- .github/workflows/arm64_graviton.yml | 4 ++++ .github/workflows/c910v.yml | 4 ++++ .github/workflows/dynamic_arch.yml | 4 ++++ .github/workflows/loongarch64.yml | 4 ++++ .github/workflows/mips64.yml | 4 ++++ .github/workflows/nightly-Homebrew-build.yml | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml index 9dd0ae589..4382510df 100644 --- a/.github/workflows/arm64_graviton.yml +++ b/.github/workflows/arm64_graviton.yml @@ -2,6 +2,10 @@ name: arm64 graviton cirun on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index e6ed08f2c..30cf32b34 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -2,6 +2,10 @@ name: c910v qemu test on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 02429e317..49721958a 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,6 +2,10 @@ name: continuous build on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index fa62d0b41..4a9bf98b6 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -2,6 +2,10 @@ name: loongarch64 qemu test on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + jobs: TEST: if: "github.repository == 'OpenMathLib/OpenBLAS'" diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 7f09d4fca..4686ba713 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -2,6 +2,10 @@ name: mips64 qemu test on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index eb315f1d4..ca57fba70 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -18,6 +18,10 @@ on: name: Nightly-Homebrew-Build +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + permissions: contents: read # to fetch code (actions/checkout) From 8613632dc53eebc505b56c38af5d46123e2704b3 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 10 Nov 2023 16:23:44 +0100 Subject: [PATCH 396/718] Trigger AWS Graviton 3 CI job only for develop and release branches --- .github/workflows/arm64_graviton.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml index 4382510df..6928312b5 100644 --- a/.github/workflows/arm64_graviton.yml +++ b/.github/workflows/arm64_graviton.yml @@ -1,6 +1,14 @@ name: arm64 graviton cirun -on: [push, pull_request] +on: + push: + branches: + - develop + - release-** + pull_request: + branches: + - develop + - release-** concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From 5ffbe646e12fe76682410674f5de4146c87aa973 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz <17718454+angsch@users.noreply.github.com> Date: Sat, 11 Nov 2023 09:19:48 +0100 Subject: [PATCH 397/718] Improve matcopy interface * rows = 0 or cols = 0 is now a legal input and takes quick return path * Follow BLAS/LAPACK convention that the leading dimensions must be at least 1. --- interface/imatcopy.c | 22 ++++++++++++---------- interface/omatcopy.c | 22 ++++++++++++---------- interface/zimatcopy.c | 30 ++++++++++++++++-------------- interface/zomatcopy.c | 30 ++++++++++++++++-------------- 4 files changed, 56 insertions(+), 48 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 4cf0966cc..6a1ad282c 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -100,27 +100,29 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 8; - if ( trans == BlasTrans && *ldb < *cols ) info = 8; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 8; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 8; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 8; - if ( trans == BlasTrans && *ldb < *rows ) info = 8; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 8; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 8; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + #ifdef NEW_IMATCOPY if ( *lda == *ldb ) { if ( order == BlasColMajor ) diff --git a/interface/omatcopy.c b/interface/omatcopy.c index 59650cfa0..c26446f5c 100644 --- a/interface/omatcopy.c +++ b/interface/omatcopy.c @@ -90,27 +90,29 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index b0b32dc87..b66489eb7 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -101,31 +101,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasConj && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTransConj && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasConj && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTransConj && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + #ifdef NEW_IMATCOPY if (*lda == *ldb ) { if ( order == BlasColMajor ) diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c index 7345633a2..7121711d8 100644 --- a/interface/zomatcopy.c +++ b/interface/zomatcopy.c @@ -92,31 +92,33 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( order == BlasColMajor) { - if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; - if ( trans == BlasConj && *ldb < *rows ) info = 9; - if ( trans == BlasTrans && *ldb < *cols ) info = 9; - if ( trans == BlasTransConj && *ldb < *cols ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*cols) ) info = 9; } if ( order == BlasRowMajor) { - if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; - if ( trans == BlasConj && *ldb < *cols ) info = 9; - if ( trans == BlasTrans && *ldb < *rows ) info = 9; - if ( trans == BlasTransConj && *ldb < *rows ) info = 9; + if ( trans == BlasNoTrans && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasConj && *ldb < MAX(1,*cols) ) info = 9; + if ( trans == BlasTrans && *ldb < MAX(1,*rows) ) info = 9; + if ( trans == BlasTransConj && *ldb < MAX(1,*rows) ) info = 9; } - if ( order == BlasColMajor && *lda < *rows ) info = 7; - if ( order == BlasRowMajor && *lda < *cols ) info = 7; - if ( *cols <= 0 ) info = 4; - if ( *rows <= 0 ) info = 3; - if ( trans < 0 ) info = 2; - if ( order < 0 ) info = 1; + if ( order == BlasColMajor && *lda < MAX(1,*rows) ) info = 7; + if ( order == BlasRowMajor && *lda < MAX(1,*cols) ) info = 7; + if ( *cols < 0 ) info = 4; + if ( *rows < 0 ) info = 3; + if ( trans < 0 ) info = 2; + if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } + if ((*rows == 0) || (*cols == 0)) return; + if ( order == BlasColMajor ) { From ff6437f2d7954530ca8cd74fb4ea98631ff83398 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Nov 2023 21:30:32 +0100 Subject: [PATCH 398/718] Add workaround for omp_get_max_threads hanging on FreeBSD with libomp from LLVM14 --- driver/others/blas_server_omp.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 3e2179373..fcc0d68ad 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -124,9 +124,18 @@ void openblas_set_num_threads(int num_threads) { } int blas_thread_init(void){ - if(blas_omp_number_max <= 0) - blas_omp_number_max = omp_get_max_threads(); - + +#if defined(__FreeBSD__) && defined(__clang__) +extern int openblas_omp_num_threads_env(); + + if(blas_omp_number_max <= 0) + blas_omp_number_max= openblas_omp_num_threads_env(); + if (blas_omp_number_max <= 0) + blas_omp_number_max=MAX_CPU_NUMBER; +#else + blas_omp_number_max = /omp_get_max_threads(); +#endif + blas_get_cpu_number(); adjust_thread_buffers(); From 9324520d0ebd8f89507f59f1fe7b5d8b0f758915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Nov 2023 23:14:58 +0100 Subject: [PATCH 399/718] typo fix --- driver/others/blas_server_omp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index fcc0d68ad..f7008fb08 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -133,7 +133,7 @@ extern int openblas_omp_num_threads_env(); if (blas_omp_number_max <= 0) blas_omp_number_max=MAX_CPU_NUMBER; #else - blas_omp_number_max = /omp_get_max_threads(); + blas_omp_number_max = omp_get_max_threads(); #endif blas_get_cpu_number(); From b6144f70ff0d6f7967c8119b068d460d6c5aaf95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Nov 2023 23:41:18 +0100 Subject: [PATCH 400/718] Change ?GECON to return INFO=1 if RCOND is NaN (Reference-LAPACK PR 926) --- lapack-netlib/SRC/cgecon.f | 39 +++++++++++++++++++++++++++++++------- lapack-netlib/SRC/dgecon.f | 39 +++++++++++++++++++++++++++++++------- lapack-netlib/SRC/sgecon.f | 39 +++++++++++++++++++++++++++++++------- lapack-netlib/SRC/zgecon.f | 39 +++++++++++++++++++++++++++++++------- 4 files changed, 128 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/cgecon.f b/lapack-netlib/SRC/cgecon.f index 6f426c2ab..e018b18bb 100644 --- a/lapack-netlib/SRC/cgecon.f +++ b/lapack-netlib/SRC/cgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - REAL AINVNM, SCALE, SL, SMLNUM, SU + REAL AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL COMPLEX ZDUM * .. * .. Local Arrays .. @@ -172,6 +179,8 @@ CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) ) * .. * .. Executable Statements .. +* + HUGEVAL = SLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -183,7 +192,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -199,6 +208,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( SISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = SLAMCH( 'Safe minimum' ) @@ -256,8 +272,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/dgecon.f b/lapack-netlib/SRC/dgecon.f index 1ad302ae3..a543eb03a 100644 --- a/lapack-netlib/SRC/dgecon.f +++ b/lapack-netlib/SRC/dgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU + DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL * .. * .. Local Arrays .. INTEGER ISAVE( 3 ) @@ -165,6 +172,8 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + HUGEVAL = DLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -176,7 +185,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -192,6 +201,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( DISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = DLAMCH( 'Safe minimum' ) @@ -248,8 +264,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/sgecon.f b/lapack-netlib/SRC/sgecon.f index 86aeea73b..82f463ebb 100644 --- a/lapack-netlib/SRC/sgecon.f +++ b/lapack-netlib/SRC/sgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - REAL AINVNM, SCALE, SL, SMLNUM, SU + REAL AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL * .. * .. Local Arrays .. INTEGER ISAVE( 3 ) @@ -165,6 +172,8 @@ INTRINSIC ABS, MAX * .. * .. Executable Statements .. +* + HUGEVAL = SLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -176,7 +185,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -192,6 +201,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( SISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = SLAMCH( 'Safe minimum' ) @@ -248,8 +264,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN diff --git a/lapack-netlib/SRC/zgecon.f b/lapack-netlib/SRC/zgecon.f index 9cbfe35bc..ef567d7c2 100644 --- a/lapack-netlib/SRC/zgecon.f +++ b/lapack-netlib/SRC/zgecon.f @@ -105,8 +105,15 @@ *> \verbatim *> INFO is INTEGER *> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> =-5: if ANORM is NAN or negative. +*> < 0: if INFO = -i, the i-th argument had an illegal value. +*> NaNs are illegal values for ANORM, and they propagate to +*> the output parameter RCOND. +*> Infinity is illegal for ANORM, and it propagates to the output +*> parameter RCOND as 0. +*> = 1: if RCOND = NaN, or +*> RCOND = Inf, or +*> the computed norm of the inverse of A is 0. +*> In the latter, RCOND = 0 is returned. *> \endverbatim * * Authors: @@ -117,7 +124,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gecon * * ===================================================================== SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, @@ -147,7 +154,7 @@ LOGICAL ONENRM CHARACTER NORMIN INTEGER IX, KASE, KASE1 - DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU + DOUBLE PRECISION AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL COMPLEX*16 ZDUM * .. * .. Local Arrays .. @@ -172,6 +179,8 @@ CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) * .. * .. Executable Statements .. +* + HUGEVAL = DLAMCH( 'Overflow' ) * * Test the input parameters. * @@ -183,7 +192,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN + ELSE IF( ANORM.LT.ZERO ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN @@ -199,6 +208,13 @@ RETURN ELSE IF( ANORM.EQ.ZERO ) THEN RETURN + ELSE IF( DISNAN( ANORM ) ) THEN + RCOND = ANORM + INFO = -5 + RETURN + ELSE IF( ANORM.GT.HUGEVAL ) THEN + INFO = -5 + RETURN END IF * SMLNUM = DLAMCH( 'Safe minimum' ) @@ -256,8 +272,17 @@ * * Compute the estimate of the reciprocal condition number. * - IF( AINVNM.NE.ZERO ) - $ RCOND = ( ONE / AINVNM ) / ANORM + IF( AINVNM.NE.ZERO ) THEN + RCOND = ( ONE / AINVNM ) / ANORM + ELSE + INFO = 1 + RETURN + END IF +* +* Check for NaNs and Infs +* + IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL ) + $ INFO = 1 * 20 CONTINUE RETURN From 58427ff74d25952c82d1d1d8edea1afaa94f9fcd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 10:54:39 +0100 Subject: [PATCH 401/718] Deprecate ?GELQS and ?GEQRS from TESTING/LIN (Reference-LAPACK PR 900) (#4307) * Move ?GELQS and ?GEQRS from TESTING/LIN to DEPRECATED (Reference-LAPACK PR 900) * Add f2c-converted versions of ?GELQS and ?GEQRS --- cmake/lapack.cmake | 8 + lapack-netlib/SRC/DEPRECATED/cgelqs.c | 479 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/cgelqs.f | 0 lapack-netlib/SRC/DEPRECATED/cgeqrs.c | 471 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/cgeqrs.f | 0 lapack-netlib/SRC/DEPRECATED/dgelqs.c | 480 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/dgelqs.f | 0 lapack-netlib/SRC/DEPRECATED/dgeqrs.c | 471 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/dgeqrs.f | 0 lapack-netlib/SRC/DEPRECATED/sgelqs.c | 472 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/sgelqs.f | 0 lapack-netlib/SRC/DEPRECATED/sgeqrs.c | 470 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/sgeqrs.f | 0 lapack-netlib/SRC/DEPRECATED/zgelqs.c | 481 ++++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/zgelqs.f | 0 lapack-netlib/SRC/DEPRECATED/zgeqrs.c | 472 +++++++++++++++++ .../{TESTING/LIN => SRC/DEPRECATED}/zgeqrs.f | 0 lapack-netlib/SRC/Makefile | 12 +- lapack-netlib/TESTING/LIN/CMakeLists.txt | 8 +- lapack-netlib/TESTING/LIN/Makefile | 8 +- lapack-netlib/TESTING/LIN/cchklq.f | 20 +- lapack-netlib/TESTING/LIN/cchkqr.f | 22 +- lapack-netlib/TESTING/LIN/cerrlq.f | 27 +- lapack-netlib/TESTING/LIN/cerrqr.f | 27 +- lapack-netlib/TESTING/LIN/dchklq.f | 20 +- lapack-netlib/TESTING/LIN/dchkqr.f | 22 +- lapack-netlib/TESTING/LIN/derrlq.f | 27 +- lapack-netlib/TESTING/LIN/derrqr.f | 27 +- lapack-netlib/TESTING/LIN/schklq.f | 20 +- lapack-netlib/TESTING/LIN/schkqr.f | 20 +- lapack-netlib/TESTING/LIN/serrlq.f | 27 +- lapack-netlib/TESTING/LIN/serrqr.f | 27 +- lapack-netlib/TESTING/LIN/zchklq.f | 20 +- lapack-netlib/TESTING/LIN/zchkqr.f | 20 +- lapack-netlib/TESTING/LIN/zerrlq.f | 27 +- lapack-netlib/TESTING/LIN/zerrqr.f | 27 +- 36 files changed, 3934 insertions(+), 278 deletions(-) create mode 100644 lapack-netlib/SRC/DEPRECATED/cgelqs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/cgelqs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/cgeqrs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/cgeqrs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/dgelqs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/dgelqs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/dgeqrs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/dgeqrs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/sgelqs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/sgelqs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/sgeqrs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/sgeqrs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/zgelqs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/zgelqs.f (100%) create mode 100644 lapack-netlib/SRC/DEPRECATED/zgeqrs.c rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/zgeqrs.f (100%) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 5c6290484..22476f561 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -438,15 +438,19 @@ endif() if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f + DEPRECATED/sgelqs.f DEPRECATED/sgeqrs.f DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f + DEPRECATED/dgelqs.f DEPRECATED/dgeqrs.f DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f + DEPRECATED/cgelqs.f DEPRECATED/cgeqrs.f DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f + DEPRECATED/zgelqs.f DEPRECATED/zgeqrs.f DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) message(STATUS "Building deprecated routines") @@ -935,15 +939,19 @@ endif() if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c + DEPRECATED/sgelqs.c DEPRECATED/sgeqrs.c DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c + DEPRECATED/dgelqs.c DEPRECATED/dgeqrs.c DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c + DEPRECATED/cgelqs.c DEPRECATED/cgeqrs.c DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c + DEPRECATED/zgelqs.c DEPRECATED/zgeqrs.c DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) message(STATUS "Building deprecated routines") diff --git a/lapack-netlib/SRC/DEPRECATED/cgelqs.c b/lapack-netlib/SRC/DEPRECATED/cgelqs.c new file mode 100644 index 000000000..ee6d56119 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/cgelqs.c @@ -0,0 +1,479 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {0.f,0.f}; +static complex c_b2 = {1.f,0.f}; + +/* > \brief \b CGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by CGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by CGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex_lin */ + +/* ===================================================================== */ +/* Subroutine */ int cgelqs_(integer *m, integer *n, integer *nrhs, complex * + a, integer *lda, complex *tau, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *), claset_(char *, + integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *), cunmlq_(char *, char + *, integer *, integer *, integer *, complex *, integer *, complex + *, complex *, integer *, complex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + ctrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + cunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of CGELQS */ + +} /* cgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/cgelqs.f b/lapack-netlib/SRC/DEPRECATED/cgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/cgelqs.f rename to lapack-netlib/SRC/DEPRECATED/cgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqrs.c b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c new file mode 100644 index 000000000..c71b8af67 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c @@ -0,0 +1,471 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {1.f,0.f}; + +/* > \brief \b CGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by CGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by CGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex_lin */ + +/* ===================================================================== */ +/* Subroutine */ int cgeqrs_(integer *m, integer *n, integer *nrhs, complex * + a, integer *lda, complex *tau, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *), xerbla_(char *, + integer *), cunmqr_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, integer *, + complex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + cunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of CGEQRS */ + +} /* cgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/cgeqrs.f b/lapack-netlib/SRC/DEPRECATED/cgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/cgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/cgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/dgelqs.c b/lapack-netlib/SRC/DEPRECATED/dgelqs.c new file mode 100644 index 000000000..e3cf1e029 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/dgelqs.c @@ -0,0 +1,480 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublereal c_b7 = 1.; +static doublereal c_b9 = 0.; + +/* > \brief \b DGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by DGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by DGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is DOUBLE PRECISION array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup double_lin */ + +/* ===================================================================== */ +/* Subroutine */ int dgelqs_(integer *m, integer *n, integer *nrhs, + doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer * + ldb, doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *), dlaset_( + char *, integer *, integer *, doublereal *, doublereal *, + doublereal *, integer *), xerbla_(char *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + dtrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + dlaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + dormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of DGELQS */ + +} /* dgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/dgelqs.f b/lapack-netlib/SRC/DEPRECATED/dgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/dgelqs.f rename to lapack-netlib/SRC/DEPRECATED/dgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqrs.c b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c new file mode 100644 index 000000000..70236738a --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c @@ -0,0 +1,471 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublereal c_b9 = 1.; + +/* > \brief \b DGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by DGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by DGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is DOUBLE PRECISION array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup double_lin */ + +/* ===================================================================== */ +/* Subroutine */ int dgeqrs_(integer *m, integer *n, integer *nrhs, + doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer * + ldb, doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + integer *, integer *, doublereal *, doublereal *, integer *, + doublereal *, integer *), xerbla_( + char *, integer *), dormqr_(char *, char *, integer *, + integer *, integer *, doublereal *, integer *, doublereal *, + doublereal *, integer *, doublereal *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + dormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of DGEQRS */ + +} /* dgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/dgeqrs.f b/lapack-netlib/SRC/DEPRECATED/dgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/dgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/dgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/sgelqs.c b/lapack-netlib/SRC/DEPRECATED/sgelqs.c new file mode 100644 index 000000000..03034b0dc --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/sgelqs.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* Table of constant values */ + +static real c_b7 = 1.f; +static real c_b9 = 0.f; + +/* > \brief \b SGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by SGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by SGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is REAL array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup single_lin */ + +/* ===================================================================== */ +/* Subroutine */ int sgelqs_(integer *m, integer *n, integer *nrhs, real *a, + integer *lda, real *tau, real *b, integer *ldb, real *work, integer * + lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + integer *, integer *, real *, real *, integer *, real *, integer * + ), xerbla_(char *, integer *), slaset_(char *, integer *, integer *, real *, real *, + real *, integer *), sormlq_(char *, char *, integer *, + integer *, integer *, real *, integer *, real *, real *, integer * + , real *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + strsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + slaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + sormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of SGELQS */ + +} /* sgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/sgelqs.f b/lapack-netlib/SRC/DEPRECATED/sgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/sgelqs.f rename to lapack-netlib/SRC/DEPRECATED/sgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqrs.c b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c new file mode 100644 index 000000000..b593d0dc9 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c @@ -0,0 +1,470 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static real c_b9 = 1.f; + +/* > \brief \b SGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by SGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by SGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is REAL array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup single_lin */ + +/* ===================================================================== */ +/* Subroutine */ int sgeqrs_(integer *m, integer *n, integer *nrhs, real *a, + integer *lda, real *tau, real *b, integer *ldb, real *work, integer * + lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + integer *, integer *, real *, real *, integer *, real *, integer * + ), xerbla_(char *, integer *), sormqr_(char *, char *, integer *, integer *, integer *, + real *, integer *, real *, real *, integer *, real *, integer *, + integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + sormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[ + b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of SGEQRS */ + +} /* sgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/sgeqrs.f b/lapack-netlib/SRC/DEPRECATED/sgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/sgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/sgeqrs.f diff --git a/lapack-netlib/SRC/DEPRECATED/zgelqs.c b/lapack-netlib/SRC/DEPRECATED/zgelqs.c new file mode 100644 index 000000000..b77ba906a --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/zgelqs.c @@ -0,0 +1,481 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {0.,0.}; +static doublecomplex c_b2 = {1.,0.}; + +/* > \brief \b ZGELQS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Compute a minimum-norm solution */ +/* > f2cmin || A*X - B || */ +/* > using the LQ factorization */ +/* > A = L*Q */ +/* > computed by ZGELQF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > Details of the LQ factorization of the original matrix A as */ +/* > returned by ZGELQF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX*16 array, dimension (M) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= N. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16_lin */ + +/* ===================================================================== */ +/* Subroutine */ int zgelqs_(integer *m, integer *n, integer *nrhs, + doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, + integer *ldb, doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *), + xerbla_(char *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *), zunmlq_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input parameters. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *m > *n) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*n)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELQS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* Solve L*X = B(1:m,:) */ + + ztrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[ + a_offset], lda, &b[b_offset], ldb); + +/* Set B(m+1:n,:) to zero */ + + if (*m < *n) { + i__1 = *n - *m; + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb); + } + +/* B := Q' * B */ + + zunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + + return 0; + +/* End of ZGELQS */ + +} /* zgelqs_ */ + diff --git a/lapack-netlib/TESTING/LIN/zgelqs.f b/lapack-netlib/SRC/DEPRECATED/zgelqs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/zgelqs.f rename to lapack-netlib/SRC/DEPRECATED/zgelqs.f diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqrs.c b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c new file mode 100644 index 000000000..3e8f3cce7 --- /dev/null +++ b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; + +/* > \brief \b ZGEQRS */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), TAU( * ), */ +/* $ WORK( LWORK ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > Solve the least squares problem */ +/* > f2cmin || A*X - B || */ +/* > using the QR factorization */ +/* > A = Q*R */ +/* > computed by ZGEQRF. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. M >= N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of B. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > Details of the QR factorization of the original matrix A as */ +/* > returned by ZGEQRF. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TAU */ +/* > \verbatim */ +/* > TAU is COMPLEX*16 array, dimension (N) */ +/* > Details of the orthogonal matrix Q. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the m-by-nrhs right hand side matrix B. */ +/* > On exit, the n-by-nrhs solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= M. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (LWORK) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The length of the array WORK. LWORK must be at least NRHS, */ +/* > and should be at least NRHS*NB, where NB is the block size */ +/* > for this environment. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16_lin */ + +/* ===================================================================== */ +/* Subroutine */ int zgeqrs_(integer *m, integer *n, integer *nrhs, + doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, + integer *ldb, doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1; + + /* Local variables */ + extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *), + xerbla_(char *, integer *), zunmqr_(char *, char *, + integer *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *, doublecomplex *, + integer *, integer *); + + +/* -- LAPACK test routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + if (*m < 0) { + *info = -1; + } else if (*n < 0 || *n > *m) { + *info = -2; + } else if (*nrhs < 0) { + *info = -3; + } else if (*lda < f2cmax(1,*m)) { + *info = -5; + } else if (*ldb < f2cmax(1,*m)) { + *info = -8; + } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { + *info = -10; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGEQRS", &i__1); + return 0; + } + +/* Quick return if possible */ + + if (*n == 0 || *nrhs == 0 || *m == 0) { + return 0; + } + +/* B := Q' * B */ + + zunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, & + tau[1], &b[b_offset], ldb, &work[1], lwork, info); + +/* Solve R*X = B(1:n,:) */ + + ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[ + a_offset], lda, &b[b_offset], ldb); + + return 0; + +/* End of ZGEQRS */ + +} /* zgeqrs_ */ + diff --git a/lapack-netlib/TESTING/LIN/zgeqrs.f b/lapack-netlib/SRC/DEPRECATED/zgeqrs.f similarity index 100% rename from lapack-netlib/TESTING/LIN/zgeqrs.f rename to lapack-netlib/SRC/DEPRECATED/zgeqrs.f diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 26314c4df..448fbd8df 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -544,26 +544,30 @@ endif ifeq ($(BUILD_COMPLEX),1) CDEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \ DEPRECATED/cgeqpf.o DEPRECATED/cggsvd.o DEPRECATED/cggsvp.o \ - DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o + DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o \ + DEPRECATED/cgelqs.o DEPRECATED/cgeqrs.o endif ifeq ($(BUILD_DOUBLE),1) DDEPRECSRC = \ DEPRECATED/dgegs.o DEPRECATED/dgegv.o DEPRECATED/dgelsx.o \ DEPRECATED/dgeqpf.o DEPRECATED/dggsvd.o DEPRECATED/dggsvp.o \ - DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o + DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o \ + DEPRECATED/dgelqs.o DEPRECATED/dgeqrs.o endif ifeq ($(BUILD_SINGLE),1) SDEPRECSRC = \ DEPRECATED/sgegs.o DEPRECATED/sgegv.o DEPRECATED/sgelsx.o \ DEPRECATED/sgeqpf.o DEPRECATED/sggsvd.o DEPRECATED/sggsvp.o \ - DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o + DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o \ + DEPRECATED/sgelqs.o DEPRECATED/sgeqrs.o endif ifeq ($(BUILD_COMPLEX16),1) ZDEPRECSRC = \ DEPRECATED/zgegs.o DEPRECATED/zgegv.o DEPRECATED/zgelsx.o \ DEPRECATED/zgeqpf.o DEPRECATED/zggsvd.o DEPRECATED/zggsvp.o \ - DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o + DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o \ + DEPRECATED/zgelqs.o DEPRECATED/zgeqrs.o endif # filter out optimized codes from OpenBLAS diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index fc55b8a96..676857a80 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -20,7 +20,7 @@ set(SLINTST schkaa.F serrgt.f serrlq.f serrls.f serrps.f serrql.f serrqp.f serrqr.f serrrq.f serrtr.f serrtz.f - sgbt01.f sgbt02.f sgbt05.f sgelqs.f sgeqls.f sgeqrs.f + sgbt01.f sgbt02.f sgbt05.f sgeqls.f sgerqs.f sget01.f sget02.f sget03.f sget04.f sget06.f sget07.f sgtt01.f sgtt02.f sgtt05.f slaptm.f slarhs.f slatb4.f slatb5.f slattb.f slattp.f @@ -70,7 +70,7 @@ set(CLINTST cchkaa.F cerrgt.f cerrlq.f cerrls.f cerrps.f cerrql.f cerrqp.f cerrqr.f cerrrq.f cerrtr.f cerrtz.f - cgbt01.f cgbt02.f cgbt05.f cgelqs.f cgeqls.f cgeqrs.f + cgbt01.f cgbt02.f cgbt05.f cgeqls.f cgerqs.f cget01.f cget02.f cget03.f cget04.f cget07.f cgtt01.f cgtt02.f cgtt05.f chet01.f chet01_rook.f chet01_3.f @@ -121,7 +121,7 @@ set(DLINTST dchkaa.F derrgt.f derrlq.f derrls.f derrps.f derrql.f derrqp.f derrqr.f derrrq.f derrtr.f derrtz.f - dgbt01.f dgbt02.f dgbt05.f dgelqs.f dgeqls.f dgeqrs.f + dgbt01.f dgbt02.f dgbt05.f dgeqls.f dgerqs.f dget01.f dget02.f dget03.f dget04.f dget06.f dget07.f dgtt01.f dgtt02.f dgtt05.f dlaptm.f dlarhs.f dlatb4.f dlatb5.f dlattb.f dlattp.f @@ -172,7 +172,7 @@ set(ZLINTST zchkaa.F zerrgt.f zerrlq.f zerrls.f zerrps.f zerrql.f zerrqp.f zerrqr.f zerrrq.f zerrtr.f zerrtz.f - zgbt01.f zgbt02.f zgbt05.f zgelqs.f zgeqls.f zgeqrs.f + zgbt01.f zgbt02.f zgbt05.f zgeqls.f zgerqs.f zget01.f zget02.f zget03.f zget04.f zget07.f zgtt01.f zgtt02.f zgtt05.f zhet01.f zhet01_rook.f zhet01_3.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 54b26455e..64abc4dba 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -55,7 +55,7 @@ SLINTST = schkaa.o \ serrgt.o serrlq.o serrls.o \ serrps.o serrql.o serrqp.o serrqr.o \ serrrq.o serrtr.o serrtz.o \ - sgbt01.o sgbt02.o sgbt05.o sgelqs.o sgeqls.o sgeqrs.o \ + sgbt01.o sgbt02.o sgbt05.o sgeqls.o \ sgerqs.o sget01.o sget02.o \ sget03.o sget04.o sget06.o sget07.o sgtt01.o sgtt02.o \ sgtt05.o slaptm.o slarhs.o slatb4.o slatb5.o slattb.o slattp.o \ @@ -100,7 +100,7 @@ CLINTST = cchkaa.o \ cerrgt.o cerrlq.o \ cerrls.o cerrps.o cerrql.o cerrqp.o \ cerrqr.o cerrrq.o cerrtr.o cerrtz.o \ - cgbt01.o cgbt02.o cgbt05.o cgelqs.o cgeqls.o cgeqrs.o \ + cgbt01.o cgbt02.o cgbt05.o cgeqls.o \ cgerqs.o cget01.o cget02.o \ cget03.o cget04.o cget07.o cgtt01.o cgtt02.o \ cgtt05.o chet01.o chet01_rook.o chet01_3.o chet01_aa.o \ @@ -147,7 +147,7 @@ DLINTST = dchkaa.o \ derrgt.o derrlq.o derrls.o \ derrps.o derrql.o derrqp.o derrqr.o \ derrrq.o derrtr.o derrtz.o \ - dgbt01.o dgbt02.o dgbt05.o dgelqs.o dgeqls.o dgeqrs.o \ + dgbt01.o dgbt02.o dgbt05.o dgeqls.o \ dgerqs.o dget01.o dget02.o \ dget03.o dget04.o dget06.o dget07.o dgtt01.o dgtt02.o \ dgtt05.o dlaptm.o dlarhs.o dlatb4.o dlatb5.o dlattb.o dlattp.o \ @@ -192,7 +192,7 @@ ZLINTST = zchkaa.o \ zerrgt.o zerrlq.o \ zerrls.o zerrps.o zerrql.o zerrqp.o \ zerrqr.o zerrrq.o zerrtr.o zerrtz.o \ - zgbt01.o zgbt02.o zgbt05.o zgelqs.o zgeqls.o zgeqrs.o \ + zgbt01.o zgbt02.o zgbt05.o zgeqls.o \ zgerqs.o zget01.o zget02.o \ zget03.o zget04.o zget07.o zgtt01.o zgtt02.o \ zgtt05.o zhet01.o zhet01_rook.o zhet01_3.o zhet01_aa.o \ diff --git a/lapack-netlib/TESTING/LIN/cchklq.f b/lapack-netlib/TESTING/LIN/cchklq.f index 54107d047..4499de36f 100644 --- a/lapack-netlib/TESTING/LIN/cchklq.f +++ b/lapack-netlib/TESTING/LIN/cchklq.f @@ -235,7 +235,7 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRLQ, CGELQS, CGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, CERRLQ, CGELS, CGET02, $ CLACPY, CLARHS, CLATB4, CLATMS, CLQT01, CLQT02, $ CLQT03, XLAENV * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call CGELQS to solve a system +* If M<=N and K=M, call CGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL CLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'CGELQS' - CALL CGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from CGELQS. +* Reset AF to the original matrix. CGELS +* factors the matrix before solving the system. +* + CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'CGELS' + CALL CGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from CGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/cchkqr.f b/lapack-netlib/TESTING/LIN/cchkqr.f index 7ea178eaf..4fa7413f9 100644 --- a/lapack-netlib/TESTING/LIN/cchkqr.f +++ b/lapack-netlib/TESTING/LIN/cchkqr.f @@ -244,7 +244,7 @@ EXTERNAL CGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, CERRQR, CGEQRS, CGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, CERRQR, CGELS, CGET02, $ CLACPY, CLARHS, CLATB4, CLATMS, CQRT01, $ CQRT01P, CQRT02, CQRT03, XLAENV * .. @@ -371,7 +371,7 @@ IF( .NOT. CGENND( M, N, AF, LDA ) ) $ RESULT( 9 ) = 2*THRESH NT = NT + 1 - ELSE IF( M.GE.N ) THEN + ELSE IF( M.GE.N ) THEN * * Test CUNGQR, using factorization * returned by CQRT01 @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call CGEQRS to solve a system +* If M>=N and K=N, call CGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL CLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'CGEQRS' - CALL CGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from CGEQRS. +* Reset AF to the original matrix. CGELS +* factors the matrix before solving the system. +* + CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'CGELS' + CALL CGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from CGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/cerrlq.f b/lapack-netlib/TESTING/LIN/cerrlq.f index 1036835b4..495adac0d 100644 --- a/lapack-netlib/TESTING/LIN/cerrlq.f +++ b/lapack-netlib/TESTING/LIN/cerrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELQ2, CGELQF, CGELQS, CHKXER, CUNGL2, + EXTERNAL ALAESM, CGELQ2, CGELQF, CHKXER, CUNGL2, $ CUNGLQ, CUNML2, CUNMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL CGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'CGELQ2', INFOT, NOUT, LERR, OK ) * -* CGELQS -* - SRNAMT = 'CGELQS' - INFOT = 1 - CALL CGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL CGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL CGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL CGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL CGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK ) -* * CUNGLQ * SRNAMT = 'CUNGLQ' diff --git a/lapack-netlib/TESTING/LIN/cerrqr.f b/lapack-netlib/TESTING/LIN/cerrqr.f index 21cf22936..30ce001eb 100644 --- a/lapack-netlib/TESTING/LIN/cerrqr.f +++ b/lapack-netlib/TESTING/LIN/cerrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CGEQR2, CGEQR2P, CGEQRF, CGEQRFP, - $ CGEQRS, CHKXER, CUNG2R, CUNGQR, CUNM2R, + $ CHKXER, CUNG2R, CUNGQR, CUNM2R, $ CUNMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL CGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'CGEQR2P', INFOT, NOUT, LERR, OK ) * -* CGEQRS -* - SRNAMT = 'CGEQRS' - INFOT = 1 - CALL CGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL CGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL CGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL CGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL CGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL CGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK ) -* * CUNGQR * SRNAMT = 'CUNGQR' diff --git a/lapack-netlib/TESTING/LIN/dchklq.f b/lapack-netlib/TESTING/LIN/dchklq.f index 70af41fe0..a207e0056 100644 --- a/lapack-netlib/TESTING/LIN/dchklq.f +++ b/lapack-netlib/TESTING/LIN/dchklq.f @@ -235,7 +235,7 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, DERRLQ, DGELQS, DGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, DERRLQ, DGELS, DGET02, $ DLACPY, DLARHS, DLATB4, DLATMS, DLQT01, DLQT02, $ DLQT03, XLAENV * .. @@ -373,7 +373,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call DGELQS to solve a system +* If M<=N and K=M, call DGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -390,14 +390,20 @@ * CALL DLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'DGELQS' - CALL DGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from DGELQS. +* Reset AF to the original matrix. DGELS +* factors the matrix before solving the system. +* + CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'DGELS' + CALL DGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from DGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/dchkqr.f b/lapack-netlib/TESTING/LIN/dchkqr.f index c729e61a9..8188d7a00 100644 --- a/lapack-netlib/TESTING/LIN/dchkqr.f +++ b/lapack-netlib/TESTING/LIN/dchkqr.f @@ -244,7 +244,7 @@ EXTERNAL DGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, DERRQR, DGEQRS, DGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, DERRQR, DGELS, DGET02, $ DLACPY, DLARHS, DLATB4, DLATMS, DQRT01, $ DQRT01P, DQRT02, DQRT03, XLAENV * .. @@ -372,7 +372,7 @@ IF( .NOT. DGENND( M, N, AF, LDA ) ) $ RESULT( 9 ) = 2*THRESH NT = NT + 1 - ELSE IF( M.GE.N ) THEN + ELSE IF( M.GE.N ) THEN * * Test DORGQR, using factorization * returned by DQRT01 @@ -389,7 +389,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call DGEQRS to solve a system +* If M>=N and K=N, call DGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -406,14 +406,20 @@ * CALL DLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'DGEQRS' - CALL DGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from DGEQRS. +* Reset AF. DGELS overwrites the matrix with +* its factorization. +* + CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'DGELS' + CALL DGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from DGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/derrlq.f b/lapack-netlib/TESTING/LIN/derrlq.f index d3cfcddd0..76ff4709e 100644 --- a/lapack-netlib/TESTING/LIN/derrlq.f +++ b/lapack-netlib/TESTING/LIN/derrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELQ2, DGELQF, DGELQS, DORGL2, + EXTERNAL ALAESM, CHKXER, DGELQ2, DGELQF, DORGL2, $ DORGLQ, DORML2, DORMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL DGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'DGELQ2', INFOT, NOUT, LERR, OK ) * -* DGELQS -* - SRNAMT = 'DGELQS' - INFOT = 1 - CALL DGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL DGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL DGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK ) -* * DORGLQ * SRNAMT = 'DORGLQ' diff --git a/lapack-netlib/TESTING/LIN/derrqr.f b/lapack-netlib/TESTING/LIN/derrqr.f index 03155b133..f7e850b80 100644 --- a/lapack-netlib/TESTING/LIN/derrqr.f +++ b/lapack-netlib/TESTING/LIN/derrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, DGEQR2, DGEQR2P, DGEQRF, - $ DGEQRFP, DGEQRS, DORG2R, DORGQR, DORM2R, + $ DGEQRFP, DORG2R, DORGQR, DORM2R, $ DORMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL DGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'DGEQR2P', INFOT, NOUT, LERR, OK ) * -* DGEQRS -* - SRNAMT = 'DGEQRS' - INFOT = 1 - CALL DGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL DGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL DGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL DGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL DGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL DGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK ) -* * DORGQR * SRNAMT = 'DORGQR' diff --git a/lapack-netlib/TESTING/LIN/schklq.f b/lapack-netlib/TESTING/LIN/schklq.f index cd66e8d10..9335503f9 100644 --- a/lapack-netlib/TESTING/LIN/schklq.f +++ b/lapack-netlib/TESTING/LIN/schklq.f @@ -235,7 +235,7 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, SERRLQ, SGELQS, SGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, SERRLQ, SGET02, $ SLACPY, SLARHS, SLATB4, SLATMS, SLQT01, SLQT02, $ SLQT03, XLAENV * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call SGELQS to solve a system +* If M<=N and K=M, call SGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL SLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'SGELQS' - CALL SGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from SGELQS. +* Reset AF to the original matrix. SGELS +* factors the matrix before solving the system. +* + CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'SGELS' + CALL SGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from SGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/schkqr.f b/lapack-netlib/TESTING/LIN/schkqr.f index 5c45ede9b..f72c8f1eb 100644 --- a/lapack-netlib/TESTING/LIN/schkqr.f +++ b/lapack-netlib/TESTING/LIN/schkqr.f @@ -244,7 +244,7 @@ EXTERNAL SGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, SERRQR, SGEQRS, SGET02, + EXTERNAL ALAERH, ALAHD, ALASUM, SERRQR, SGELS, SGET02, $ SLACPY, SLARHS, SLATB4, SLATMS, SQRT01, $ SQRT01P, SQRT02, SQRT03, XLAENV * .. @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call SGEQRS to solve a system +* If M>=N and K=N, call SGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL SLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'SGEQRS' - CALL SGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from SGEQRS. +* Reset AF to the original matrix. SGELS +* factors the matrix before solving the system. +* + CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'SGELS' + CALL SGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from SGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/serrlq.f b/lapack-netlib/TESTING/LIN/serrlq.f index 5bb0fe201..e5df8ce52 100644 --- a/lapack-netlib/TESTING/LIN/serrlq.f +++ b/lapack-netlib/TESTING/LIN/serrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELQ2, SGELQF, SGELQS, SORGL2, + EXTERNAL ALAESM, CHKXER, SGELQ2, SGELQF, SORGL2, $ SORGLQ, SORML2, SORMLQ * .. * .. Scalars in Common .. @@ -140,31 +140,6 @@ CALL SGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'SGELQ2', INFOT, NOUT, LERR, OK ) * -* SGELQS -* - SRNAMT = 'SGELQS' - INFOT = 1 - CALL SGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL SGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL SGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL SGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL SGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK ) -* * SORGLQ * SRNAMT = 'SORGLQ' diff --git a/lapack-netlib/TESTING/LIN/serrqr.f b/lapack-netlib/TESTING/LIN/serrqr.f index 1ad40b7aa..e228813f7 100644 --- a/lapack-netlib/TESTING/LIN/serrqr.f +++ b/lapack-netlib/TESTING/LIN/serrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR2, SGEQR2P, SGEQRF, - $ SGEQRFP, SGEQRS, SORG2R, SORGQR, SORM2R, + $ SGEQRFP, SORG2R, SORGQR, SORM2R, $ SORMQR * .. * .. Scalars in Common .. @@ -170,31 +170,6 @@ CALL SGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'SGEQR2P', INFOT, NOUT, LERR, OK ) * -* SGEQRS -* - SRNAMT = 'SGEQRS' - INFOT = 1 - CALL SGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL SGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL SGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL SGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL SGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL SGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK ) -* * SORGQR * SRNAMT = 'SORGQR' diff --git a/lapack-netlib/TESTING/LIN/zchklq.f b/lapack-netlib/TESTING/LIN/zchklq.f index 371bb946b..ccef7b803 100644 --- a/lapack-netlib/TESTING/LIN/zchklq.f +++ b/lapack-netlib/TESTING/LIN/zchklq.f @@ -235,7 +235,7 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELQS, + EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELS, $ ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZLQT01, $ ZLQT02, ZLQT03 * .. @@ -370,7 +370,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call ZGELQS to solve a system +* If M<=N and K=M, call ZGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -387,14 +387,20 @@ * CALL ZLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'ZGELQS' - CALL ZGELQS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from ZGELQS. +* Reset AF to the original matrix. ZGELS +* factors the matrix before solving the system. +* + CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'ZGELS' + CALL ZGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from ZGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGELQS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/zchkqr.f b/lapack-netlib/TESTING/LIN/zchkqr.f index a240d2da5..c088bacc9 100644 --- a/lapack-netlib/TESTING/LIN/zchkqr.f +++ b/lapack-netlib/TESTING/LIN/zchkqr.f @@ -244,7 +244,7 @@ EXTERNAL ZGENND * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGEQRS, + EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGELS, $ ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZQRT01, $ ZQRT01P, ZQRT02, ZQRT03 * .. @@ -388,7 +388,7 @@ $ WORK, LWORK, RWORK, RESULT( 3 ) ) NT = NT + 4 * -* If M>=N and K=N, call ZGEQRS to solve a system +* If M>=N and K=N, call ZGELS to solve a system * with NRHS right hand sides and compute the * residual. * @@ -405,14 +405,20 @@ * CALL ZLACPY( 'Full', M, NRHS, B, LDA, X, $ LDA ) - SRNAMT = 'ZGEQRS' - CALL ZGEQRS( M, N, NRHS, AF, LDA, TAU, X, - $ LDA, WORK, LWORK, INFO ) * -* Check error code from ZGEQRS. +* Reset AF to the original matrix. ZGELS +* factors the matrix before solving the system. +* + CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA ) +* + SRNAMT = 'ZGELS' + CALL ZGELS( 'No transpose', M, N, NRHS, AF, + $ LDA, X, LDA, WORK, LWORK, INFO ) +* +* Check error code from ZGELS. * IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGEQRS', INFO, 0, ' ', + $ CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N', $ M, N, NRHS, -1, NB, IMAT, $ NFAIL, NERRS, NOUT ) * diff --git a/lapack-netlib/TESTING/LIN/zerrlq.f b/lapack-netlib/TESTING/LIN/zerrlq.f index d8e5a8fe8..d91b4e4b3 100644 --- a/lapack-netlib/TESTING/LIN/zerrlq.f +++ b/lapack-netlib/TESTING/LIN/zerrlq.f @@ -76,7 +76,7 @@ $ W( NMAX ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELQ2, ZGELQF, ZGELQS, ZUNGL2, + EXTERNAL ALAESM, CHKXER, ZGELQ2, ZGELQF, ZUNGL2, $ ZUNGLQ, ZUNML2, ZUNMLQ * .. * .. Scalars in Common .. @@ -142,31 +142,6 @@ CALL ZGELQ2( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'ZGELQ2', INFOT, NOUT, LERR, OK ) * -* ZGELQS -* - SRNAMT = 'ZGELQS' - INFOT = 1 - CALL ZGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL ZGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL ZGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK ) -* * ZUNGLQ * SRNAMT = 'ZUNGLQ' diff --git a/lapack-netlib/TESTING/LIN/zerrqr.f b/lapack-netlib/TESTING/LIN/zerrqr.f index 114453d4c..3542c7a04 100644 --- a/lapack-netlib/TESTING/LIN/zerrqr.f +++ b/lapack-netlib/TESTING/LIN/zerrqr.f @@ -77,7 +77,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, ZGEQR2, ZGEQR2P, ZGEQRF, - $ ZGEQRFP, ZGEQRS, ZUNG2R, ZUNGQR, ZUNM2R, + $ ZGEQRFP, ZUNG2R, ZUNGQR, ZUNM2R, $ ZUNMQR * .. * .. Scalars in Common .. @@ -172,31 +172,6 @@ CALL ZGEQR2P( 2, 1, A, 1, B, W, INFO ) CALL CHKXER( 'ZGEQR2P', INFOT, NOUT, LERR, OK ) * -* ZGEQRS -* - SRNAMT = 'ZGEQRS' - INFOT = 1 - CALL ZGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 2 - CALL ZGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 3 - CALL ZGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 5 - CALL ZGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 8 - CALL ZGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) - INFOT = 10 - CALL ZGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO ) - CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK ) -* * ZUNGQR * SRNAMT = 'ZUNGQR' From 225036fd92fb0093280dbfdafd295a4f40678917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 13:43:22 +0100 Subject: [PATCH 402/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904) --- lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 9 +++++---- lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 9 +++++---- lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 9 +++++---- lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 9 +++++---- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 743731a00..d3f78b8be 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -176,7 +176,8 @@ C> * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. @@ -225,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT) ELSE LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF @@ -413,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index bbdd46113..dd8baab4d 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -176,7 +176,8 @@ C> * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + DOUBLE PRECISION DROUNDUP_LWORK + EXTERNAL ILAENV, DROUNDUP_LWORK * .. * .. Executable Statements .. @@ -225,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT) ELSE LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT) END IF @@ -413,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = DROUNDUP_LWORK(IWS) RETURN * * End of DGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index bf68d635b..93dc48fa5 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -176,7 +176,8 @@ C> * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + DOUBLE PRECISION DROUNDUP_LWORK + EXTERNAL ILAENV, DROUNDUP_LWORK * .. * .. Executable Statements .. @@ -225,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT) ELSE LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = DROUNDUP_LWORK(LWKOPT) END IF @@ -413,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = DROUNDUP_LWORK(IWS) RETURN * * End of SGEQRF diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 06918568e..3ef07bfc7 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -176,7 +176,8 @@ C> * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. @@ -225,13 +226,13 @@ C> * Optimal workspace for dlarfb = MAX(1,N)*NT * LWKOPT = (LBWORK+LLWORK)*NB - WORK( 1 ) = (LWKOPT+NT*NT) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT) ELSE LBWORK = CEILING(REAL(K)/REAL(NB))*NB LWKOPT = (LBWORK+LLWORK-NB)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF @@ -413,7 +414,7 @@ C> END IF - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of ZGEQRF From c9378badd929615aadca9120a664ab1e4bf83d11 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 13:56:06 +0100 Subject: [PATCH 403/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904) --- lapack-netlib/SRC/cgees.f | 13 ++-- lapack-netlib/SRC/cgeesx.f | 13 ++-- lapack-netlib/SRC/cgeev.f | 14 ++-- lapack-netlib/SRC/cgeevx.f | 14 ++-- lapack-netlib/SRC/cgehrd.f | 9 +-- lapack-netlib/SRC/cgelq.f | 11 +-- lapack-netlib/SRC/cgelqf.f | 9 +-- lapack-netlib/SRC/cgelsd.f | 13 ++-- lapack-netlib/SRC/cgelss.f | 8 +-- lapack-netlib/SRC/cgelst.f | 19 +++-- lapack-netlib/SRC/cgeqlf.f | 9 +-- lapack-netlib/SRC/cgeqrf.f | 9 +-- lapack-netlib/SRC/cgeqrfp.f | 9 +-- lapack-netlib/SRC/cgerqf.f | 9 +-- lapack-netlib/SRC/cgesvd.f | 10 +-- lapack-netlib/SRC/cgetri.f | 9 +-- lapack-netlib/SRC/cgetsls.f | 15 ++-- lapack-netlib/SRC/cgges.f | 14 ++-- lapack-netlib/SRC/cggesx.f | 14 ++-- lapack-netlib/SRC/cggev.f | 14 ++-- lapack-netlib/SRC/cggevx.f | 13 ++-- lapack-netlib/SRC/cggglm.f | 7 +- lapack-netlib/SRC/cgglse.f | 7 +- lapack-netlib/SRC/cggqrf.f | 7 +- lapack-netlib/SRC/cggrqf.f | 7 +- lapack-netlib/SRC/chbev_2stage.f | 23 +++--- lapack-netlib/SRC/chbevd.f | 10 +-- lapack-netlib/SRC/chbevx_2stage.f | 29 ++++---- lapack-netlib/SRC/chbgvd.f | 9 +-- lapack-netlib/SRC/cheev.f | 10 +-- lapack-netlib/SRC/cheev_2stage.f | 25 +++---- lapack-netlib/SRC/cheevd.f | 10 +-- lapack-netlib/SRC/cheevr.f | 10 +-- lapack-netlib/SRC/cheevx.f | 10 +-- lapack-netlib/SRC/cheevx_2stage.f | 29 ++++---- lapack-netlib/SRC/chegv.f | 9 +-- lapack-netlib/SRC/chegv_2stage.f | 23 +++--- lapack-netlib/SRC/chegvd.f | 9 +-- lapack-netlib/SRC/chegvx.f | 9 +-- lapack-netlib/SRC/chesv.f | 9 +-- lapack-netlib/SRC/chesv_aa.f | 9 +-- lapack-netlib/SRC/chesv_aa_2stage.f | 7 +- lapack-netlib/SRC/chesv_rk.f | 9 +-- lapack-netlib/SRC/chesv_rook.f | 9 +-- lapack-netlib/SRC/chesvx.f | 10 +-- lapack-netlib/SRC/chetrd_hb2st.F | 107 ++++++++++++++-------------- 46 files changed, 334 insertions(+), 317 deletions(-) diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 71acfdba3..2085dc49b 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -189,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gees * * ===================================================================== SUBROUTINE CGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS, @@ -230,13 +230,13 @@ * .. * .. External Subroutines .. EXTERNAL CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY, - $ CLASCL, CTRSEN, CUNGHR, SLABAD, XERBLA + $ CLASCL, CTRSEN, CUNGHR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -292,7 +292,7 @@ MAXWRK = MAX( MAXWRK, HSWORK ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -318,7 +318,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -413,7 +412,7 @@ CALL CCOPY( N, A, LDA+1, W, 1 ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEES diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 782e36747..036ae90c2 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -230,7 +230,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup geesx * * ===================================================================== SUBROUTINE CGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W, @@ -274,13 +274,13 @@ * .. * .. External Subroutines .. EXTERNAL CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY, - $ CLASCL, CTRSEN, CUNGHR, SLABAD, SLASCL, XERBLA + $ CLASCL, CTRSEN, CUNGHR, SLASCL, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -350,7 +350,7 @@ IF( .NOT.WANTSN ) $ LWRK = MAX( LWRK, ( N*N )/2 ) END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -376,7 +376,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -488,7 +487,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEESX diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index a77525ef8..bb41599d1 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -172,7 +172,7 @@ * * @generated from zgeev.f, fortran z -> c, Tue Apr 19 01:47:44 2016 * -*> \ingroup complexGEeigen +*> \ingroup geev * * ===================================================================== SUBROUTINE CGEEV( JOBVL, JOBVR, N, A, LDA, W, VL, LDVL, VR, LDVR, @@ -212,14 +212,15 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SLABAD, XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD, + EXTERNAL XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD, $ CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, CUNGHR * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2, CLANGE - EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE + REAL SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, CMPLX, CONJG, AIMAG, MAX, SQRT @@ -291,7 +292,7 @@ HSWORK = INT( WORK(1) ) MAXWRK = MAX( MAXWRK, HSWORK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -315,7 +316,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -493,7 +493,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEEV diff --git a/lapack-netlib/SRC/cgeevx.f b/lapack-netlib/SRC/cgeevx.f index 2388f5acc..5dbc394e9 100644 --- a/lapack-netlib/SRC/cgeevx.f +++ b/lapack-netlib/SRC/cgeevx.f @@ -279,7 +279,7 @@ * * @generated from zgeevx.f, fortran z -> c, Tue Apr 19 01:47:44 2016 * -*> \ingroup complexGEeigen +*> \ingroup geevx * * ===================================================================== SUBROUTINE CGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, W, VL, @@ -323,15 +323,16 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SLABAD, SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL, + EXTERNAL SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL, $ CGEHRD, CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, $ CTRSNA, CUNGHR * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2, CLANGE - EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE + REAL SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, CMPLX, CONJG, AIMAG, MAX, SQRT @@ -434,7 +435,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -458,7 +459,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -657,7 +657,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGEEVX diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index d9c050267..f407f931a 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -201,7 +201,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -227,7 +228,7 @@ * NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) LWKOPT = N*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -344,7 +345,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index d66033166..ff482bc42 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -190,7 +192,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGELQT, CLASWLQ, XERBLA @@ -292,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK(LWREQ) END IF END IF IF( INFO.NE.0 ) THEN @@ -319,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK(LWREQ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 37ef13a27..75f5bc960 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -118,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -167,7 +167,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -176,7 +177,7 @@ INFO = 0 NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -266,7 +267,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f index c3c77bf63..5d7eec68d 100644 --- a/lapack-netlib/SRC/cgelsd.f +++ b/lapack-netlib/SRC/cgelsd.f @@ -204,7 +204,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -249,13 +249,13 @@ * .. External Subroutines .. EXTERNAL CGEBRD, CGELQF, CGEQRF, CLACPY, $ CLALSD, CLASCL, CLASET, CUNMBR, - $ CUNMLQ, CUNMQR, SLABAD, SLASCL, + $ CUNMLQ, CUNMQR, SLASCL, $ SLASET, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL CLANGE, SLAMCH, ILAENV + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL CLANGE, SLAMCH, ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, LOG, MAX, MIN, REAL @@ -367,7 +367,7 @@ END IF END IF MINWRK = MIN( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RWORK( 1 ) = LRWORK * @@ -396,7 +396,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * @@ -647,7 +646,7 @@ END IF * 10 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RWORK( 1 ) = LRWORK RETURN diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index d1e38c504..00d7f596a 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -218,8 +218,8 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -361,7 +361,7 @@ END IF MAXWRK = MAX( MINWRK, MAXWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -12 @@ -758,7 +758,7 @@ CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, N, NRHS, B, LDB, INFO ) END IF 70 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGELSS diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f index 7d8e44ddf..b69626934 100644 --- a/lapack-netlib/SRC/cgelst.f +++ b/lapack-netlib/SRC/cgelst.f @@ -176,7 +176,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup gelst * *> \par Contributors: * ================== @@ -224,15 +224,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, $ CLASCL, CLASET, CTRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -270,7 +270,7 @@ * MNNRHS = MAX( MN, NRHS ) LWOPT = MAX( 1, (MN+MNNRHS)*NB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * END IF * @@ -285,7 +285,7 @@ * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -311,7 +311,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -334,7 +333,7 @@ * Matrix all zero. Return zero solution. * CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -524,7 +523,7 @@ $ INFO ) END IF * - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index d2c11c269..918bbddad 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -113,7 +113,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -162,7 +162,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -186,7 +187,7 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -276,7 +277,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f index d71bd5b33..bf22a2cd3 100644 --- a/lapack-netlib/SRC/cgeqrf.f +++ b/lapack-netlib/SRC/cgeqrf.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqrf * *> \par Further Details: * ===================== @@ -170,7 +170,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -199,7 +200,7 @@ ELSE LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -274,7 +275,7 @@ $ CALL CGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQRF diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index 995404f43..eaf98ddf3 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -122,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -173,7 +173,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,7 +183,7 @@ INFO = 0 NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -272,7 +273,7 @@ $ CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGEQRFP diff --git a/lapack-netlib/SRC/cgerqf.f b/lapack-netlib/SRC/cgerqf.f index d2247844c..6f914c892 100644 --- a/lapack-netlib/SRC/cgerqf.f +++ b/lapack-netlib/SRC/cgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -163,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -187,7 +188,7 @@ NB = ILAENV( 1, 'CGERQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF ( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) @@ -278,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGERQF diff --git a/lapack-netlib/SRC/cgesvd.f b/lapack-netlib/SRC/cgesvd.f index 239b13431..6165a6acf 100644 --- a/lapack-netlib/SRC/cgesvd.f +++ b/lapack-netlib/SRC/cgesvd.f @@ -206,7 +206,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvd * * ===================================================================== SUBROUTINE CGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT, @@ -259,8 +259,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -615,7 +615,7 @@ END IF END IF MAXWRK = MAX( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -3694,7 +3694,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index bd7fc286c..2060d1444 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -138,7 +138,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA @@ -153,7 +154,7 @@ INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index 8a4d02224..b4bb7562f 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE CGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -191,15 +191,15 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLABAD, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEQR, CGEMQR, CLASCL, CLASET, $ CTRTRS, XERBLA, CGELQ, CGEMLQ * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN, INT + INTRINSIC MAX, MIN, INT * .. * .. Executable Statements .. * @@ -265,7 +265,7 @@ INFO = -10 END IF * - WORK( 1 ) = REAL( WSIZEO ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZEO ) * END IF * @@ -274,7 +274,7 @@ RETURN END IF IF( LQUERY ) THEN - IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM ) + IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM ) RETURN END IF IF( LWORK.LT.WSIZEO ) THEN @@ -297,7 +297,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -485,7 +484,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( TSZO + LWO ) + WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO ) RETURN * * End of CGETSLS diff --git a/lapack-netlib/SRC/cgges.f b/lapack-netlib/SRC/cgges.f index c54174da4..0ff848735 100644 --- a/lapack-netlib/SRC/cgges.f +++ b/lapack-netlib/SRC/cgges.f @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE CGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -312,14 +312,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -390,7 +389,7 @@ LWKOPT = MAX( LWKOPT, N + $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -18 @@ -415,7 +414,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -587,7 +585,7 @@ * 30 CONTINUE * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f index 6385a74c1..3bf460fac 100644 --- a/lapack-netlib/SRC/cggesx.f +++ b/lapack-netlib/SRC/cggesx.f @@ -320,7 +320,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggesx * * ===================================================================== SUBROUTINE CGGESX( JOBVSL, JOBVSR, SORT, SELCTG, SENSE, N, A, LDA, @@ -373,14 +373,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -476,7 +475,7 @@ MAXWRK = 1 LWRK = 1 END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) IF( WANTSN .OR. N.EQ.0 ) THEN LIWMIN = 1 ELSE @@ -510,7 +509,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -705,7 +703,7 @@ * 40 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cggev.f b/lapack-netlib/SRC/cggev.f index c1c28a180..cf16e3079 100644 --- a/lapack-netlib/SRC/cggev.f +++ b/lapack-netlib/SRC/cggev.f @@ -209,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev * * ===================================================================== SUBROUTINE CGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -254,14 +254,13 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -336,7 +335,7 @@ LWKOPT = MAX( LWKOPT, N + $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -15 @@ -359,7 +358,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -547,7 +545,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CGGEV diff --git a/lapack-netlib/SRC/cggevx.f b/lapack-netlib/SRC/cggevx.f index 405c9c3b5..fa4e92682 100644 --- a/lapack-netlib/SRC/cggevx.f +++ b/lapack-netlib/SRC/cggevx.f @@ -335,7 +335,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggevx * *> \par Further Details: * ===================== @@ -416,13 +416,13 @@ * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY, $ CLASCL, CLASET, CTGEVC, CTGSNA, CUNGQR, CUNMQR, - $ SLABAD, SLASCL, XERBLA + $ SLASCL, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANGE, SLAMCH - EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -521,7 +521,7 @@ $ N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, 0 ) ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -25 @@ -545,7 +545,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -793,7 +792,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of CGGEVX diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index fb384b651..0d36deca6 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup ggglm * * ===================================================================== SUBROUTINE CGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, @@ -213,7 +213,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -252,7 +253,7 @@ LWKMIN = M + N + P LWKOPT = M + NP + MAX( N, P )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index cca20dfed..b1c562385 100644 --- a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERsolve +*> \ingroup gglse * * ===================================================================== SUBROUTINE CGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, @@ -207,7 +207,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -246,7 +247,7 @@ LWKMIN = M + N + P LWKOPT = P + MN + MAX( M, N )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 0185f4e0d..29b0bf4af 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -236,7 +236,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -251,7 +252,7 @@ NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 5227100da..273ab3ef7 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -235,7 +235,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -250,7 +251,7 @@ NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/chbev_2stage.f b/lapack-netlib/SRC/chbev_2stage.f index 123d84729..f84d8d3d4 100644 --- a/lapack-netlib/SRC/chbev_2stage.f +++ b/lapack-netlib/SRC/chbev_2stage.f @@ -132,7 +132,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbev_2stage * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -197,11 +197,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -240,8 +240,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHB - EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE + REAL SLAMCH, CLANHB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR, @@ -276,7 +277,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -285,7 +286,7 @@ LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -351,7 +352,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, + $ RWORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call CSTEQR. @@ -377,7 +378,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f index de33c9039..a5afe6b76 100644 --- a/lapack-netlib/SRC/chbevd.f +++ b/lapack-netlib/SRC/chbevd.f @@ -201,7 +201,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbevd * * ===================================================================== SUBROUTINE CHBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK, @@ -239,8 +239,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANHB, SLAMCH - EXTERNAL LSAME, CLANHB, SLAMCH + REAL CLANHB, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANHB, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CHBTRD, CLACPY, CLASCL, CSTEDC, SSCAL, @@ -288,7 +288,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -382,7 +382,7 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chbevx_2stage.f b/lapack-netlib/SRC/chbevx_2stage.f index 22bced45f..1d609dfbd 100644 --- a/lapack-netlib/SRC/chbevx_2stage.f +++ b/lapack-netlib/SRC/chbevx_2stage.f @@ -22,7 +22,7 @@ * * SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, * Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, -* Z, LDZ, WORK, LWORK, RWORK, IWORK, +* Z, LDZ, WORK, LWORK, RWORK, IWORK, * IFAIL, INFO ) * * IMPLICIT NONE @@ -233,7 +233,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS @@ -285,7 +285,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbevx_2stage * *> \par Further Details: * ===================== @@ -303,7 +303,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -311,18 +311,18 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB, $ Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W, - $ Z, LDZ, WORK, LWORK, RWORK, IWORK, + $ Z, LDZ, WORK, LWORK, RWORK, IWORK, $ IFAIL, INFO ) * IMPLICIT NONE @@ -367,8 +367,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHB - EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE + REAL SLAMCH, CLANHB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY, @@ -424,16 +425,16 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ, $ N, KD, -1, -1 ) - LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, + LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) - LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, + LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -637,7 +638,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f index 655006370..00fb2b5f5 100644 --- a/lapack-netlib/SRC/chbgvd.f +++ b/lapack-netlib/SRC/chbgvd.f @@ -232,7 +232,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hbgvd * *> \par Contributors: * ================== @@ -275,7 +275,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSTERF, XERBLA, CGEMM, CHBGST, CHBTRD, CLACPY, @@ -322,7 +323,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -388,7 +389,7 @@ CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/cheev.f b/lapack-netlib/SRC/cheev.f index fb8e451df..60df7d8b8 100644 --- a/lapack-netlib/SRC/cheev.f +++ b/lapack-netlib/SRC/cheev.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heev * * ===================================================================== SUBROUTINE CHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, @@ -169,8 +169,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CLASCL, CSTEQR, CUNGTR, SSCAL, SSTERF, @@ -201,7 +201,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB+1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) $ INFO = -8 @@ -286,7 +286,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cheev_2stage.f b/lapack-netlib/SRC/cheev_2stage.f index fb7989d9f..4e1cecc64 100644 --- a/lapack-netlib/SRC/cheev_2stage.f +++ b/lapack-netlib/SRC/cheev_2stage.f @@ -106,12 +106,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -149,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -167,7 +167,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -175,11 +175,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -220,8 +220,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHE - EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR, @@ -255,7 +256,7 @@ LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) $ INFO = -8 @@ -314,7 +315,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, first call @@ -343,7 +344,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index dce0b2083..b5ca804eb 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -180,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== @@ -230,8 +230,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CLACPY, CLASCL, CSTEDC, CUNMTR, SSCAL, @@ -282,7 +282,7 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * @@ -378,7 +378,7 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index b8854b182..05c5e66be 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -338,7 +338,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -392,8 +392,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANSY, SLAMCH - EXTERNAL LSAME, ILAENV, CLANSY, SLAMCH + REAL CLANSY, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, CLANSY, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRD, CSSCAL, CSTEMR, CSTEIN, CSWAP, CUNMTR, @@ -454,7 +454,7 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -710,7 +710,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index 1cec902aa..e91599a44 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE CHEEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -294,8 +294,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANHE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANHE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -354,7 +354,7 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) @@ -552,7 +552,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cheevx_2stage.f b/lapack-netlib/SRC/cheevx_2stage.f index 04f6e30e8..70a681ec4 100644 --- a/lapack-netlib/SRC/cheevx_2stage.f +++ b/lapack-netlib/SRC/cheevx_2stage.f @@ -209,12 +209,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 8*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -265,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevx_2stage * *> \par Further Details: * ===================== @@ -283,7 +283,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -291,11 +291,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -335,7 +335,7 @@ CHARACTER ORDER INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, - $ ITMP1, J, JJ, LLWORK, + $ ITMP1, J, JJ, LLWORK, $ NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, $ SIGMA, SMLNUM, TMP1, VLL, VUU @@ -343,8 +343,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, CLANHE - EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE + REAL SLAMCH, CLANHE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANHE, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -397,7 +398,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', JOBZ, $ N, -1, -1, -1 ) @@ -408,7 +409,7 @@ LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, $ N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -499,7 +500,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ), - $ RWORK( INDE ), WORK( INDTAU ), + $ RWORK( INDE ), WORK( INDTAU ), $ WORK( INDHOUS ), LHTRD, WORK( INDWRK ), $ LLWORK, IINFO ) * @@ -610,7 +611,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f index 198e5d102..53f9d5196 100644 --- a/lapack-netlib/SRC/chegv.f +++ b/lapack-netlib/SRC/chegv.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegv * * ===================================================================== SUBROUTINE CHEGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, @@ -206,7 +206,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEV, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -240,7 +241,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -309,7 +310,7 @@ END IF END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f index d2b8fc795..8de1f7f06 100644 --- a/lapack-netlib/SRC/chegv_2stage.f +++ b/lapack-netlib/SRC/chegv_2stage.f @@ -144,12 +144,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -192,7 +192,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegv_2stage * *> \par Further Details: * ===================== @@ -210,7 +210,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -218,11 +218,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -259,7 +259,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM, @@ -297,7 +298,7 @@ LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -327,7 +328,7 @@ * Transform problem to standard eigenvalue problem and solve. * CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) - CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, + CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, $ WORK, LWORK, RWORK, INFO ) * IF( WANTZ ) THEN @@ -367,7 +368,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 4edc36f2a..d2dc941e6 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -219,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegvd * *> \par Further Details: * ===================== @@ -268,7 +268,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEVD, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -316,7 +317,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * @@ -392,7 +393,7 @@ END IF END IF * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) RWORK( 1 ) = LROPT IWORK( 1 ) = LIOPT * diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f index 8e565222d..172d0571e 100644 --- a/lapack-netlib/SRC/chegvx.f +++ b/lapack-netlib/SRC/chegvx.f @@ -293,7 +293,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup hegvx * *> \par Contributors: * ================== @@ -335,7 +335,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHEEVX, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA @@ -390,7 +391,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -464,7 +465,7 @@ * * Set WORK(1) to optimal complex workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv.f b/lapack-netlib/SRC/chesv.f index 238fb0a94..cea1235b7 100644 --- a/lapack-netlib/SRC/chesv.f +++ b/lapack-netlib/SRC/chesv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE CHESV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -191,7 +191,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF, CHETRS, CHETRS2 @@ -226,7 +227,7 @@ NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -259,7 +260,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index c9b97e09c..53ecc0a16 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -182,7 +182,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_AA, CHETRS_AA @@ -217,7 +218,7 @@ $ -1, INFO ) LWKOPT_HETRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +240,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 36970a329..12950c4af 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -207,7 +207,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRF_AA_2STAGE, CHETRS_AA_2STAGE, @@ -267,7 +268,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index e123fa299..268a55e23 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_RK, CHETRS_3 @@ -282,7 +283,7 @@ CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -304,7 +305,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesv_rook.f b/lapack-netlib/SRC/chesv_rook.f index 8e0b1a88f..2a0d3fdaf 100644 --- a/lapack-netlib/SRC/chesv_rook.f +++ b/lapack-netlib/SRC/chesv_rook.f @@ -184,7 +184,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesv_rook *> *> \verbatim *> @@ -225,7 +225,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CHETRF_ROOK, CHETRS_ROOK @@ -260,7 +261,7 @@ NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -283,7 +284,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index 6da49bdcf..c23a35ce7 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE CHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -313,8 +313,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANHE, SLAMCH - EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH + REAL CLANHE, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHECON, CHERFS, CHETRF, CHETRS, CLACPY, XERBLA @@ -356,7 +356,7 @@ NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +405,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 30b01ed83..3688e40a3 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the chetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the chetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the chetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of chetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -147,7 +147,7 @@ *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +208,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +216,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +259,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN REAL ABSTMP @@ -277,8 +277,9 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + INTEGER ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -318,7 +319,7 @@ * IF( INFO.EQ.0 ) THEN HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( INFO.NE.0 ) THEN @@ -358,7 +359,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +367,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +386,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = REAL( AB( ABDPOS, I ) ) @@ -444,7 +445,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. * THGRSIZ = N @@ -453,7 +454,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL CLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +463,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +472,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +483,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,16 +509,16 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -525,20 +526,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -553,14 +554,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,10 +576,10 @@ C END IF ENDIF * HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of CHETRD_HB2ST * END - + From 71fbdd908d59087e8d809dd123b83f68850ec122 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 14:10:16 +0100 Subject: [PATCH 404/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904) --- lapack-netlib/SRC/chetrd_he2hb.f | 9 +++++---- lapack-netlib/SRC/chetrf.f | 9 +++++---- lapack-netlib/SRC/chetrf_aa.f | 9 +++++---- lapack-netlib/SRC/chetrf_rk.f | 9 +++++---- lapack-netlib/SRC/chetrf_rook.f | 9 +++++---- lapack-netlib/SRC/chetri_3.f | 9 +++++---- lapack-netlib/SRC/chetrs_aa.f | 7 ++++--- lapack-netlib/SRC/chpevd.f | 10 +++++----- lapack-netlib/SRC/chpgvd.f | 9 +++++---- lapack-netlib/SRC/chseqr.f | 5 +++-- lapack-netlib/SRC/clamswlq.f | 11 +++++++---- lapack-netlib/SRC/clamtsqr.f | 9 ++++++--- lapack-netlib/SRC/claswlq.f | 14 +++++++------- lapack-netlib/SRC/clatsqr.f | 9 ++++++--- lapack-netlib/SRC/cstedc.f | 10 +++++----- lapack-netlib/SRC/cstemr.f | 8 ++++---- lapack-netlib/SRC/csysv.f | 9 +++++---- lapack-netlib/SRC/csysv_aa.f | 9 +++++---- lapack-netlib/SRC/csysv_aa_2stage.f | 7 ++++--- lapack-netlib/SRC/csysv_rk.f | 9 +++++---- lapack-netlib/SRC/csysv_rook.f | 9 +++++---- lapack-netlib/SRC/csysvx.f | 10 +++++----- lapack-netlib/SRC/csytrf.f | 9 +++++---- lapack-netlib/SRC/csytrf_aa.f | 9 +++++---- lapack-netlib/SRC/csytrf_aa_2stage.f | 7 ++++--- lapack-netlib/SRC/csytrf_rk.f | 9 +++++---- lapack-netlib/SRC/csytrf_rook.f | 9 +++++---- lapack-netlib/SRC/csytri_3.f | 9 +++++---- lapack-netlib/SRC/csytrs_aa.f | 7 ++++--- lapack-netlib/SRC/ctgsen.f | 10 +++++++--- lapack-netlib/SRC/ctgsna.f | 14 +++++++------- lapack-netlib/SRC/ctgsyl.f | 9 +++++---- lapack-netlib/SRC/ctrevc3.f | 12 ++++++------ lapack-netlib/SRC/ctrsen.f | 10 +++++----- lapack-netlib/SRC/ctzrzf.f | 9 +++++---- lapack-netlib/SRC/cunbdb.f | 8 ++++---- lapack-netlib/SRC/cunbdb1.f | 8 ++++---- lapack-netlib/SRC/cunbdb2.f | 8 ++++---- lapack-netlib/SRC/cunbdb3.f | 8 ++++---- lapack-netlib/SRC/cunbdb4.f | 8 ++++---- lapack-netlib/SRC/cuncsd.f | 8 +++++--- lapack-netlib/SRC/cuncsd2by1.f | 7 ++++--- lapack-netlib/SRC/cungbr.f | 9 +++++---- lapack-netlib/SRC/cunghr.f | 9 +++++---- lapack-netlib/SRC/cunglq.f | 9 +++++---- lapack-netlib/SRC/cungql.f | 7 ++++--- lapack-netlib/SRC/cungqr.f | 9 +++++---- lapack-netlib/SRC/cungrq.f | 9 +++++---- lapack-netlib/SRC/cungtr.f | 9 +++++---- lapack-netlib/SRC/cunmbr.f | 9 +++++---- lapack-netlib/SRC/cunmhr.f | 9 +++++---- lapack-netlib/SRC/cunmlq.f | 9 +++++---- lapack-netlib/SRC/cunmql.f | 9 +++++---- lapack-netlib/SRC/cunmqr.f | 9 +++++---- lapack-netlib/SRC/cunmrq.f | 9 +++++---- lapack-netlib/SRC/cunmrz.f | 9 +++++---- lapack-netlib/SRC/cunmtr.f | 9 +++++---- 57 files changed, 282 insertions(+), 228 deletions(-) diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 904555c10..090f02100 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -158,7 +158,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -283,7 +283,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -313,7 +314,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN END IF * @@ -506,7 +507,7 @@ END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 484e76256..0c596ffe7 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -197,7 +197,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETF2, CLAHEF, XERBLA @@ -228,7 +229,7 @@ * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -346,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index d9e4fbd19..0547a4eab 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_aa * * ===================================================================== SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) @@ -159,7 +159,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_AA, CGEMM, CCOPY, CSWAP, CSCAL, XERBLA @@ -190,7 +191,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -459,7 +460,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index 3f60f4370..ef442c937 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_RK, CHETF2_RK, CSWAP, XERBLA @@ -311,7 +312,7 @@ * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 805e0f4cb..1593c2edc 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -232,7 +232,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAHEF_ROOK, CHETF2_ROOK, XERBLA @@ -263,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -386,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f index cc2318b22..deda63598 100644 --- a/lapack-netlib/SRC/chetri_3.f +++ b/lapack-netlib/SRC/chetri_3.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +190,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI_3X, XERBLA @@ -225,7 +226,7 @@ CALL XERBLA( 'CHETRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -236,7 +237,7 @@ * CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 2546adb2d..879549106 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -123,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -155,7 +155,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME,SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA @@ -186,7 +187,7 @@ RETURN ELSE IF( LQUERY ) THEN LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f index 06d01064d..2449783a2 100644 --- a/lapack-netlib/SRC/chpevd.f +++ b/lapack-netlib/SRC/chpevd.f @@ -186,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hpevd * * ===================================================================== SUBROUTINE CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, @@ -223,8 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANHP, SLAMCH - EXTERNAL LSAME, CLANHP, SLAMCH + REAL CLANHP, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANHP, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHPTRD, CSSCAL, CSTEDC, CUPMTR, SSCAL, SSTERF, @@ -268,7 +268,7 @@ LIWMIN = 1 END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -359,7 +359,7 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index c24ca1360..57ac4fc72 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHEReigen +*> \ingroup hpgvd * *> \par Contributors: * ================== @@ -246,7 +246,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHPEVD, CHPGST, CPPTRF, CTPMV, CTPSV, XERBLA @@ -292,7 +293,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -374,7 +375,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN RETURN diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index 007f72f59..56ff01fc6 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -216,7 +216,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup hseqr * *> \par Contributors: * ================== @@ -343,7 +343,8 @@ * .. External Functions .. INTEGER ILAENV LOGICAL LSAME - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CCOPY, CLACPY, CLAHQR, CLAQR0, CLASET, XERBLA diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 1606cc611..5daf60bf6 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -189,6 +189,8 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, $ LDT, C, LDC, WORK, LWORK, INFO ) @@ -215,7 +217,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. External Subroutines .. EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. @@ -259,10 +262,10 @@ * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = LW + WORK(1) = SROUNDUP_LWORK(LW) RETURN ELSE IF (LQUERY) THEN - WORK(1) = LW + WORK(1) = SROUNDUP_LWORK(LW) RETURN END IF * @@ -401,7 +404,7 @@ * END IF * - WORK(1) = LW + WORK(1) = SROUNDUP_LWORK(LW) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 5677420ac..05021e642 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -191,6 +191,8 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, $ LDT, C, LDC, WORK, LWORK, INFO ) @@ -217,7 +219,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. External Subroutines .. EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. @@ -264,7 +267,7 @@ * Determine the block size if it is tall skinny or short and wide * IF( INFO.EQ.0) THEN - WORK(1) = LW + WORK(1) = SROUNDUP_LWORK(LW) END IF * IF( INFO.NE.0 ) THEN @@ -409,7 +412,7 @@ * END IF * - WORK(1) = LW + WORK(1) = SROUNDUP_LWORK(LW) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 1a09b8305..12e8373df 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -159,6 +159,8 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, $ INFO) @@ -183,16 +185,14 @@ * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME - EXTERNAL LSAME + INTEGER ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. -* .. EXTERNAL FUNCTIONS .. - INTEGER ILAENV - EXTERNAL ILAENV -* .. * .. EXECUTABLE STATEMENTS .. * * TEST THE INPUT ARGUMENTS @@ -217,7 +217,7 @@ INFO = -10 END IF IF( INFO.EQ.0) THEN - WORK(1) = MB*M + WORK(1) = SROUNDUP_LWORK(MB*M) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +266,7 @@ $ WORK, INFO ) END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = SROUNDUP_LWORK(M * MB) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index 377190081..cd2cb4aa7 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -161,6 +161,8 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO) @@ -185,7 +187,8 @@ * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. EXTERNAL SUBROUTINES .. EXTERNAL CGEQRT, CTPQRT, XERBLA * .. INTRINSIC FUNCTIONS .. @@ -215,7 +218,7 @@ INFO = -10 END IF IF( INFO.EQ.0) THEN - WORK(1) = NB*N + WORK(1) = SROUNDUP_LWORK(NB*N) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) @@ -262,7 +265,7 @@ $ WORK, INFO ) END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK(N*NB) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f index 77a4ec3be..d7db591b3 100644 --- a/lapack-netlib/SRC/cstedc.f +++ b/lapack-netlib/SRC/cstedc.f @@ -192,7 +192,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup stedc * *> \par Contributors: * ================== @@ -233,8 +233,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL ILAENV, LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CLACPY, CLACRM, CLAED0, CSTEQR, CSWAP, @@ -295,7 +295,7 @@ LRWMIN = 1 + 4*N + 2*N**2 LIWMIN = 3 + 5*N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -466,7 +466,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f index 9d47450e3..46b20d880 100644 --- a/lapack-netlib/SRC/cstemr.f +++ b/lapack-netlib/SRC/cstemr.f @@ -376,8 +376,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARRV, CSWAP, SCOPY, SLAE2, SLAEV2, SLARRC, @@ -462,7 +462,7 @@ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN @@ -801,7 +801,7 @@ ENDIF * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 4ddabf62f..a2d1e7cbe 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE CSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -190,7 +190,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF, CSYTRS, CSYTRS2 @@ -225,7 +226,7 @@ CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -258,7 +259,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f index 8548c2789..571a91123 100644 --- a/lapack-netlib/SRC/csysv_aa.f +++ b/lapack-netlib/SRC/csysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE CSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -182,7 +182,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_AA, CSYTRS_AA @@ -217,7 +218,7 @@ $ -1, INFO ) LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +240,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f index 22227505c..10119d8ba 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.f +++ b/lapack-netlib/SRC/csysv_aa_2stage.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -207,7 +207,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CSYTRF_AA_2STAGE, @@ -267,7 +268,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index ef5334dcd..cb98ab1dc 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_RK, CSYTRS_3 @@ -282,7 +283,7 @@ CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -304,7 +305,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index aad594e21..8798ddfb2 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -181,7 +181,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesv_rook * *> \par Contributors: * ================== @@ -223,7 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CSYTRF_ROOK, CSYTRS_ROOK @@ -258,7 +259,7 @@ CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -281,7 +282,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csysvx.f b/lapack-netlib/SRC/csysvx.f index 2afa082a9..3c7a37889 100644 --- a/lapack-netlib/SRC/csysvx.f +++ b/lapack-netlib/SRC/csysvx.f @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE CSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -313,8 +313,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL CLANSY, SLAMCH - EXTERNAL ILAENV, LSAME, CLANSY, SLAMCH + REAL CLANSY, SLAMCH, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, CLANSY, SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CSYCON, CSYRFS, CSYTRF, CSYTRS, XERBLA @@ -356,7 +356,7 @@ NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +405,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index 951196b83..519e78490 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -202,7 +202,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF, CSYTF2, XERBLA @@ -233,7 +234,7 @@ * NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -351,7 +352,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f index c5467bf01..cf994913d 100644 --- a/lapack-netlib/SRC/csytrf_aa.f +++ b/lapack-netlib/SRC/csytrf_aa.f @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== SUBROUTINE CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) @@ -159,7 +159,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_AA, CGEMM, CGEMV, CSCAL, CSWAP, CCOPY, @@ -191,7 +192,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -457,7 +458,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_AA diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f index b21df8cd3..e56aedaf6 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.f +++ b/lapack-netlib/SRC/csytrf_aa_2stage.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CCOPY, CGBTRF, CGEMM, CGETRF, CLACPY, @@ -230,7 +231,7 @@ TB( 1 ) = (3*NB+1)*N END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK(N*NB) END IF END IF IF( TQUERY .OR. WQUERY ) THEN diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f index 996801e7d..de39bda41 100644 --- a/lapack-netlib/SRC/csytrf_rk.f +++ b/lapack-netlib/SRC/csytrf_rk.f @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_RK, CSYTF2_RK, CSWAP, XERBLA @@ -311,7 +312,7 @@ * NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_RK diff --git a/lapack-netlib/SRC/csytrf_rook.f b/lapack-netlib/SRC/csytrf_rook.f index ce7c1e586..72fe0629f 100644 --- a/lapack-netlib/SRC/csytrf_rook.f +++ b/lapack-netlib/SRC/csytrf_rook.f @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -228,7 +228,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLASYF_ROOK, CSYTF2_ROOK, XERBLA @@ -259,7 +260,7 @@ * NB = ILAENV( 1, 'CSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -382,7 +383,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CSYTRF_ROOK diff --git a/lapack-netlib/SRC/csytri_3.f b/lapack-netlib/SRC/csytri_3.f index 279f62853..604d84b21 100644 --- a/lapack-netlib/SRC/csytri_3.f +++ b/lapack-netlib/SRC/csytri_3.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +190,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CSYTRI_3X, XERBLA @@ -225,7 +226,7 @@ CALL XERBLA( 'CSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -236,7 +237,7 @@ * CALL CSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f index 1f6ea40af..7f63539a6 100644 --- a/lapack-netlib/SRC/csytrs_aa.f +++ b/lapack-netlib/SRC/csytrs_aa.f @@ -123,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE CSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -155,7 +155,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CGTSV, CSWAP, CTRSM, XERBLA @@ -186,7 +187,7 @@ RETURN ELSE IF( LQUERY ) THEN LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f index ffd638099..180e96b32 100644 --- a/lapack-netlib/SRC/ctgsen.f +++ b/lapack-netlib/SRC/ctgsen.f @@ -290,7 +290,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tgsen * *> \par Further Details: * ===================== @@ -467,6 +467,10 @@ * .. Local Arrays .. INTEGER ISAVE( 3 ) * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. REAL SLAMCH EXTERNAL CLACN2, CLACPY, CLASSQ, CSCAL, CTGEXC, CTGSYL, @@ -537,7 +541,7 @@ LIWMIN = 1 END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -771,7 +775,7 @@ * 70 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ctgsna.f b/lapack-netlib/SRC/ctgsna.f index 2295dc5cc..50498c413 100644 --- a/lapack-netlib/SRC/ctgsna.f +++ b/lapack-netlib/SRC/ctgsna.f @@ -213,7 +213,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tgsna * *> \par Further Details: * ===================== @@ -343,12 +343,13 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SCNRM2, SLAMCH, SLAPY2 + REAL SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK COMPLEX CDOTC - EXTERNAL LSAME, SCNRM2, SLAMCH, SLAPY2, CDOTC + EXTERNAL LSAME, SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK, + $ CDOTC * .. * .. External Subroutines .. - EXTERNAL CGEMV, CLACPY, CTGEXC, CTGSYL, SLABAD, XERBLA + EXTERNAL CGEMV, CLACPY, CTGEXC, CTGSYL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, CMPLX, MAX @@ -402,7 +403,7 @@ ELSE LWMIN = N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( MM.LT.M ) THEN INFO = -15 @@ -428,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) KS = 0 DO 20 K = 1, N * @@ -508,7 +508,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of CTGSNA diff --git a/lapack-netlib/SRC/ctgsyl.f b/lapack-netlib/SRC/ctgsyl.f index ae1437125..620556399 100644 --- a/lapack-netlib/SRC/ctgsyl.f +++ b/lapack-netlib/SRC/ctgsyl.f @@ -260,7 +260,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup tgsyl * *> \par Contributors: * ================== @@ -329,7 +329,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMM, CLACPY, CLASET, CSCAL, CTGSY2, XERBLA @@ -382,7 +383,7 @@ ELSE LWMIN = 1 END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -683,7 +684,7 @@ 210 CONTINUE END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index 11b32104d..13cbf553f 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -222,7 +222,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup trevc3 * *> \par Further Details: * ===================== @@ -278,12 +278,13 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ICAMAX - REAL SLAMCH, SCASUM - EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM + REAL SLAMCH, SCASUM, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, CCOPY, CLASET, CSSCAL, CGEMM, CGEMV, - $ CLATRS, CLACPY, SLABAD + $ CLATRS, CLACPY * .. * .. Intrinsic Functions .. INTRINSIC ABS, REAL, CMPLX, CONJG, AIMAG, MAX @@ -322,7 +323,7 @@ INFO = 0 NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) MAXWRK = MAX( 1, N + 2*N*NB ) - WORK(1) = MAXWRK + WORK(1) = SROUNDUP_LWORK(MAXWRK) RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN @@ -371,7 +372,6 @@ * UNFL = SLAMCH( 'Safe minimum' ) OVFL = ONE / UNFL - CALL SLABAD( UNFL, OVFL ) ULP = SLAMCH( 'Precision' ) SMLNUM = UNFL*( N / ULP ) * diff --git a/lapack-netlib/SRC/ctrsen.f b/lapack-netlib/SRC/ctrsen.f index d93b97be6..9d59f6bf2 100644 --- a/lapack-netlib/SRC/ctrsen.f +++ b/lapack-netlib/SRC/ctrsen.f @@ -182,7 +182,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup trsen * *> \par Further Details: * ===================== @@ -293,8 +293,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE - EXTERNAL LSAME, CLANGE + REAL CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACN2, CLACPY, CTREXC, CTRSYL, XERBLA @@ -350,7 +350,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( INFO.NE.0 ) THEN @@ -444,7 +444,7 @@ W( K ) = T( K, K ) 50 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ctzrzf.f b/lapack-netlib/SRC/ctzrzf.f index b21f092ce..ac3f59400 100644 --- a/lapack-netlib/SRC/ctzrzf.f +++ b/lapack-netlib/SRC/ctzrzf.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup tzrzf * *> \par Contributors: * ================== @@ -179,7 +179,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -207,7 +208,7 @@ LWKOPT = M*NB LWKMIN = MAX( 1, M ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -301,7 +302,7 @@ IF( MU.GT.0 ) $ CALL CLATRZ( MU, N, N-M, A, LDA, TAU, WORK ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cunbdb.f b/lapack-netlib/SRC/cunbdb.f index a41895dc8..b45dcfde6 100644 --- a/lapack-netlib/SRC/cunbdb.f +++ b/lapack-netlib/SRC/cunbdb.f @@ -255,7 +255,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb * *> \par Further Details: * ===================== @@ -320,9 +320,9 @@ * * .. * .. External Functions .. - REAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK LOGICAL LSAME - EXTERNAL SCNRM2, LSAME + EXTERNAL SCNRM2, SROUNDUP_LWORK, LSAME * .. * .. Intrinsic Functions INTRINSIC ATAN2, COS, MAX, MIN, SIN @@ -377,7 +377,7 @@ IF( INFO .EQ. 0 ) THEN LWORKOPT = M - Q LWORKMIN = M - Q - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT. LQUERY ) THEN INFO = -21 END IF diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f index 80faa8808..a4875ab5b 100644 --- a/lapack-netlib/SRC/cunbdb1.f +++ b/lapack-netlib/SRC/cunbdb1.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb1 * *> \par Further Details: * ===================== @@ -230,8 +230,8 @@ EXTERNAL CLACGV * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -264,7 +264,7 @@ LORBDB5 = Q-2 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index 94b9fdbf9..6399964f8 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb2 * *> \par Further Details: * ===================== @@ -231,8 +231,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -265,7 +265,7 @@ LORBDB5 = Q-1 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f index f942bc698..d02460597 100644 --- a/lapack-netlib/SRC/cunbdb3.f +++ b/lapack-netlib/SRC/cunbdb3.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb3 * *> \par Further Details: * ===================== @@ -229,8 +229,8 @@ EXTERNAL CLARF, CLARFGP, CUNBDB5, CSROT, CLACGV, XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -263,7 +263,7 @@ LORBDB5 = Q-1 LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index a551c184e..33acc1ee5 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -183,7 +183,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb4 * *> \par Further Details: * ===================== @@ -242,8 +242,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SCNRM2, SROUNDUP_LWORK + EXTERNAL SCNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC ATAN2, COS, MAX, SIN, SQRT @@ -277,7 +277,7 @@ LWORKOPT = ILARF + LLARF - 1 LWORKOPT = MAX( LWORKOPT, IORBDB5 + LORBDB5 - 1 ) LWORKMIN = LWORKOPT - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -14 END IF diff --git a/lapack-netlib/SRC/cuncsd.f b/lapack-netlib/SRC/cuncsd.f index 3653a396a..003daaab4 100644 --- a/lapack-netlib/SRC/cuncsd.f +++ b/lapack-netlib/SRC/cuncsd.f @@ -308,7 +308,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup uncsd * * ===================================================================== RECURSIVE SUBROUTINE CUNCSD( JOBU1, JOBU2, JOBV1T, JOBV2T, TRANS, @@ -365,7 +365,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. Intrinsic Functions INTRINSIC INT, MAX, MIN @@ -504,7 +505,8 @@ $ IORBDB + LORBDBWORKOPT ) - 1 LWORKMIN = MAX( IORGQR + LORGQRWORKMIN, IORGLQ + LORGLQWORKMIN, $ IORBDB + LORBDBWORKMIN ) - 1 - WORK(1) = MAX(LWORKOPT,LWORKMIN) + LWORKOPT = MAX(LWORKOPT,LWORKMIN) + WORK(1) = SROUNDUP_LWORK(LWORKOPT) * IF( LWORK .LT. LWORKMIN $ .AND. .NOT. ( LQUERY .OR. LRQUERY ) ) THEN diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f index f0c44f670..128e82cec 100644 --- a/lapack-netlib/SRC/cuncsd2by1.f +++ b/lapack-netlib/SRC/cuncsd2by1.f @@ -247,7 +247,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup uncsd2by1 * * ===================================================================== SUBROUTINE CUNCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11, @@ -299,7 +299,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. Intrinsic Function .. INTRINSIC INT, MAX, MIN @@ -508,7 +509,7 @@ LWORKOPT = MAX( IORBDB+LORBDB-1, $ IORGQR+LORGQROPT-1, $ IORGLQ+LORGLQOPT-1 ) - WORK(1) = LWORKOPT + WORK(1) = SROUNDUP_LWORK(LWORKOPT) IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index a31a53d79..2f0208fdb 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGBcomputational +*> \ingroup ungbr * * ===================================================================== SUBROUTINE CUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -180,7 +180,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNGLQ, CUNGQR, XERBLA @@ -241,7 +242,7 @@ CALL XERBLA( 'CUNGBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -327,7 +328,7 @@ END IF END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGBR diff --git a/lapack-netlib/SRC/cunghr.f b/lapack-netlib/SRC/cunghr.f index 4f8a0a263..3aa3fb1ae 100644 --- a/lapack-netlib/SRC/cunghr.f +++ b/lapack-netlib/SRC/cunghr.f @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unghr * * ===================================================================== SUBROUTINE CUNGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) @@ -151,7 +151,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +179,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'CUNGQR', ' ', NH, NH, NH, -1 ) LWKOPT = MAX( 1, NH )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -230,7 +231,7 @@ CALL CUNGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ), $ WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGHR diff --git a/lapack-netlib/SRC/cunglq.f b/lapack-netlib/SRC/cunglq.f index e250e036c..353715054 100644 --- a/lapack-netlib/SRC/cunglq.f +++ b/lapack-netlib/SRC/cunglq.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unglq * * ===================================================================== SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -155,7 +155,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -164,7 +165,7 @@ INFO = 0 NB = ILAENV( 1, 'CUNGLQ', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, M )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -278,7 +279,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGLQ diff --git a/lapack-netlib/SRC/cungql.f b/lapack-netlib/SRC/cungql.f index d3b812a62..ed2f6803c 100644 --- a/lapack-netlib/SRC/cungql.f +++ b/lapack-netlib/SRC/cungql.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungql * * ===================================================================== SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -8 diff --git a/lapack-netlib/SRC/cungqr.f b/lapack-netlib/SRC/cungqr.f index 5010ae0df..b6e8cc59a 100644 --- a/lapack-netlib/SRC/cungqr.f +++ b/lapack-netlib/SRC/cungqr.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungqr * * ===================================================================== SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -165,7 +166,7 @@ INFO = 0 NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, N )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -279,7 +280,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGQR diff --git a/lapack-netlib/SRC/cungrq.f b/lapack-netlib/SRC/cungrq.f index 1593ff938..aceaac0b8 100644 --- a/lapack-netlib/SRC/cungrq.f +++ b/lapack-netlib/SRC/cungrq.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungrq * * ===================================================================== SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'CUNGRQ', ' ', M, N, K, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -286,7 +287,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of CUNGRQ diff --git a/lapack-netlib/SRC/cungtr.f b/lapack-netlib/SRC/cungtr.f index 26ff0428e..27f197340 100644 --- a/lapack-netlib/SRC/cungtr.f +++ b/lapack-netlib/SRC/cungtr.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ungtr * * ===================================================================== SUBROUTINE CUNGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNGQL, CUNGQR, XERBLA @@ -179,7 +180,7 @@ NB = ILAENV( 1, 'CUNGQR', ' ', N-1, N-1, N-1, -1 ) END IF LWKOPT = MAX( 1, N-1 )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -245,7 +246,7 @@ $ LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNGTR diff --git a/lapack-netlib/SRC/cunmbr.f b/lapack-netlib/SRC/cunmbr.f index cef6025b0..a21c486e9 100644 --- a/lapack-netlib/SRC/cunmbr.f +++ b/lapack-netlib/SRC/cunmbr.f @@ -189,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmbr * * ===================================================================== SUBROUTINE CUNMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C, @@ -218,7 +218,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMLQ, CUNMQR, XERBLA @@ -290,7 +291,7 @@ ELSE LWKOPT = 1 END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +368,7 @@ $ TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMBR diff --git a/lapack-netlib/SRC/cunmhr.f b/lapack-netlib/SRC/cunmhr.f index af3140d5f..29bb631f1 100644 --- a/lapack-netlib/SRC/cunmhr.f +++ b/lapack-netlib/SRC/cunmhr.f @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmhr * * ===================================================================== SUBROUTINE CUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMQR, XERBLA @@ -253,7 +254,7 @@ NB = ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, NH, NH, -1 ) END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -285,7 +286,7 @@ CALL CUNMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA, $ TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMHR diff --git a/lapack-netlib/SRC/cunmlq.f b/lapack-netlib/SRC/cunmlq.f index 25a410770..4da1af1d5 100644 --- a/lapack-netlib/SRC/cunmlq.f +++ b/lapack-netlib/SRC/cunmlq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmlq * * ===================================================================== SUBROUTINE CUNMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNML2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -343,7 +344,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMLQ diff --git a/lapack-netlib/SRC/cunmql.f b/lapack-netlib/SRC/cunmql.f index 3c7166066..84fc29d32 100644 --- a/lapack-netlib/SRC/cunmql.f +++ b/lapack-netlib/SRC/cunmql.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmql * * ===================================================================== SUBROUTINE CUNMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNM2L, XERBLA @@ -249,7 +250,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -332,7 +333,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMQL diff --git a/lapack-netlib/SRC/cunmqr.f b/lapack-netlib/SRC/cunmqr.f index 7e59d7129..7d85a861f 100644 --- a/lapack-netlib/SRC/cunmqr.f +++ b/lapack-netlib/SRC/cunmqr.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmqr * * ===================================================================== SUBROUTINE CUNMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNM2R, XERBLA @@ -245,7 +246,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -331,7 +332,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMQR diff --git a/lapack-netlib/SRC/cunmrq.f b/lapack-netlib/SRC/cunmrq.f index 5a233f604..f02cfd9a9 100644 --- a/lapack-netlib/SRC/cunmrq.f +++ b/lapack-netlib/SRC/cunmrq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmrq * * ===================================================================== SUBROUTINE CUNMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARFB, CLARFT, CUNMR2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -337,7 +338,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMRQ diff --git a/lapack-netlib/SRC/cunmrz.f b/lapack-netlib/SRC/cunmrz.f index 8e06f2329..9ccf1878b 100644 --- a/lapack-netlib/SRC/cunmrz.f +++ b/lapack-netlib/SRC/cunmrz.f @@ -168,7 +168,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmrz * *> \par Contributors: * ================== @@ -213,7 +213,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLARZB, CLARZT, CUNMR3, XERBLA @@ -271,7 +272,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -371,7 +372,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/cunmtr.f b/lapack-netlib/SRC/cunmtr.f index 097dba91e..6eafc15c4 100644 --- a/lapack-netlib/SRC/cunmtr.f +++ b/lapack-netlib/SRC/cunmtr.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unmtr * * ===================================================================== SUBROUTINE CUNMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC, @@ -192,7 +192,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CUNMQL, CUNMQR, XERBLA @@ -256,7 +257,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -301,7 +302,7 @@ CALL CUNMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU, $ C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of CUNMTR From f5664740cd492d9f7c614c4876a9204c40fdf777 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 14:29:04 +0100 Subject: [PATCH 405/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904) --- lapack-netlib/SRC/sgebrd.f | 11 ++++++----- lapack-netlib/SRC/sgees.f | 15 +++++++-------- lapack-netlib/SRC/sgeesx.f | 13 ++++++------- lapack-netlib/SRC/sgeev.f | 16 +++++++--------- lapack-netlib/SRC/sgeevx.f | 13 ++++++------- lapack-netlib/SRC/sgehrd.f | 9 +++++---- lapack-netlib/SRC/sgelq.f | 11 +++++++---- lapack-netlib/SRC/sgelqf.f | 9 +++++---- lapack-netlib/SRC/sgels.f | 15 +++++++-------- lapack-netlib/SRC/sgelsd.f | 15 +++++++-------- lapack-netlib/SRC/sgelss.f | 8 ++++---- lapack-netlib/SRC/sgelst.f | 19 +++++++++---------- lapack-netlib/SRC/sgelsy.f | 13 ++++++------- lapack-netlib/SRC/sgemlq.f | 16 +++++++++++----- lapack-netlib/SRC/sgemqr.f | 15 +++++++++------ lapack-netlib/SRC/sgeqlf.f | 9 +++++---- lapack-netlib/SRC/sgeqp3.f | 10 +++++----- lapack-netlib/SRC/sgeqrf.f | 9 +++++---- lapack-netlib/SRC/sgeqrfp.f | 9 +++++---- lapack-netlib/SRC/sgerqf.f | 9 +++++---- lapack-netlib/SRC/sgesvd.f | 10 +++++----- lapack-netlib/SRC/sgesvdx.f | 10 +++++----- lapack-netlib/SRC/sgetri.f | 9 +++++---- lapack-netlib/SRC/sgetsls.f | 15 +++++++-------- lapack-netlib/SRC/sgetsqrhrt.f | 12 ++++++++---- lapack-netlib/SRC/sgges.f | 16 +++++++--------- lapack-netlib/SRC/sgges3.f | 16 +++++++--------- lapack-netlib/SRC/sggesx.f | 16 +++++++--------- lapack-netlib/SRC/sggev.f | 16 +++++++--------- lapack-netlib/SRC/sggev3.f | 16 +++++++--------- lapack-netlib/SRC/sggevx.f | 17 ++++++++--------- lapack-netlib/SRC/sggglm.f | 7 ++++--- lapack-netlib/SRC/sgghd3.f | 11 ++++++----- lapack-netlib/SRC/sgglse.f | 7 ++++--- lapack-netlib/SRC/sggqrf.f | 10 ++++++---- lapack-netlib/SRC/sggrqf.f | 10 ++++++---- lapack-netlib/SRC/sggsvd3.f | 10 +++++----- lapack-netlib/SRC/sggsvp3.f | 9 +++++---- lapack-netlib/SRC/shgeqz.f | 9 +++++---- lapack-netlib/SRC/shseqr.f | 7 ++++--- lapack-netlib/SRC/slaqr2.f | 13 ++++++------- lapack-netlib/SRC/slaqr3.f | 16 +++++++--------- lapack-netlib/SRC/slaqr4.f | 11 ++++++----- lapack-netlib/SRC/slaqz0.f | 6 +++--- lapack-netlib/SRC/slaqz3.f | 9 ++++----- lapack-netlib/SRC/slaqz4.f | 5 +++-- lapack-netlib/SRC/slaswlq.f | 7 +++++-- lapack-netlib/SRC/sorgbr.f | 9 +++++---- 48 files changed, 284 insertions(+), 269 deletions(-) diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 08701164c..2d0c6d651 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -147,7 +147,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -230,11 +230,12 @@ EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN, REAL + INTRINSIC MAX, MIN * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -243,7 +244,7 @@ INFO = 0 NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -341,7 +342,7 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK(WS) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index 6febd549c..4418ea064 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gees * * ===================================================================== SUBROUTINE SGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI, @@ -250,14 +250,14 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, - $ SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA + EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, + $ SLASCL, SORGHR, SSWAP, STRSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -312,7 +312,7 @@ MAXWRK = MAX( MAXWRK, N + HSWORK ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -338,7 +338,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -524,7 +523,7 @@ 30 CONTINUE END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEES diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 6810fe7c8..cabe9f1f7 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -272,7 +272,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup geesx * * ===================================================================== SUBROUTINE SGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, @@ -317,14 +317,14 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, + EXTERNAL SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, $ SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -398,7 +398,7 @@ $ LIWRK = ( N*N )/4 END IF IWORK( 1 ) = LIWRK - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -16 @@ -426,7 +426,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -634,7 +633,7 @@ 30 CONTINUE END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IF( WANTSV .OR. WANTSB ) THEN IWORK( 1 ) = SDIM*(N-SDIM) ELSE diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index ed1724721..93f993265 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -184,7 +184,7 @@ * * @generated from dgeev.f, fortran d -> s, Tue Apr 19 01:47:44 2016 * -*> \ingroup realGEeigen +*> \ingroup geev * * ===================================================================== SUBROUTINE SGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR, @@ -223,16 +223,15 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY, - $ SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3, - $ XERBLA + EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, SLARTG, + $ SLASCL, SORGHR, SROT, SSCAL, STREVC3, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SLANGE, SLAPY2, SNRM2 + REAL SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2, - $ SNRM2 + $ SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -312,7 +311,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -336,7 +335,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -519,7 +517,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEEV diff --git a/lapack-netlib/SRC/sgeevx.f b/lapack-netlib/SRC/sgeevx.f index ed1ea1cb9..b0af78605 100644 --- a/lapack-netlib/SRC/sgeevx.f +++ b/lapack-netlib/SRC/sgeevx.f @@ -297,7 +297,7 @@ * * @generated from dgeevx.f, fortran d -> s, Tue Apr 19 01:47:44 2016 * -*> \ingroup realGEeigen +*> \ingroup geevx * * ===================================================================== SUBROUTINE SGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, WR, WI, @@ -341,16 +341,16 @@ REAL DUM( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY, + EXTERNAL SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, $ SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3, $ STRSNA, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX, ILAENV - REAL SLAMCH, SLANGE, SLAPY2, SNRM2 + REAL SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK EXTERNAL LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2, - $ SNRM2 + $ SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -453,7 +453,7 @@ END IF MAXWRK = MAX( MAXWRK, MINWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -21 @@ -477,7 +477,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -684,7 +683,7 @@ END IF END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGEEVX diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 41b9aa78e..47733d947 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -201,7 +201,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -227,7 +228,7 @@ * NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) LWKOPT = N*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -344,7 +345,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 9209f918e..74c7cc267 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -190,7 +192,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGELQT, SLASWLQ, XERBLA @@ -292,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK(LWREQ) END IF END IF IF( INFO.NE.0 ) THEN @@ -319,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = LWREQ + WORK( 1 ) = SROUNDUP_LWORK(LWREQ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 24d8ab19c..1ceec4742 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -118,7 +118,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -167,7 +167,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -176,7 +177,7 @@ INFO = 0 NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -266,7 +267,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgels.f b/lapack-netlib/SRC/sgels.f index ea02c3318..b58f70c9e 100644 --- a/lapack-netlib/SRC/sgels.f +++ b/lapack-netlib/SRC/sgels.f @@ -175,7 +175,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gels * * ===================================================================== SUBROUTINE SGELS( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, @@ -210,15 +210,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SGELQF, SGEQRF, SLABAD, SLASCL, SLASET, SORMLQ, + EXTERNAL SGELQF, SGEQRF, SLASCL, SLASET, SORMLQ, $ SORMQR, STRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN, REAL + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -273,7 +273,7 @@ END IF * WSIZE = MAX( 1, MN + MAX( MN, NRHS )*NB ) - WORK( 1 ) = REAL( WSIZE ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZE ) * END IF * @@ -295,7 +295,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -492,7 +491,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( WSIZE ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZE ) * RETURN * diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f index 9fda7b593..2818213f4 100644 --- a/lapack-netlib/SRC/sgelsd.f +++ b/lapack-netlib/SRC/sgelsd.f @@ -189,7 +189,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -229,13 +229,13 @@ REAL ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. External Subroutines .. - EXTERNAL SGEBRD, SGELQF, SGEQRF, SLABAD, SLACPY, SLALSD, - $ SLASCL, SLASET, SORMBR, SORMLQ, SORMQR, XERBLA + EXTERNAL SGEBRD, SGELQF, SGEQRF, SLACPY, SLALSD, SLASCL, + $ SLASET, SORMBR, SORMLQ, SORMQR, XERBLA * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, ILAENV + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL SLAMCH, SLANGE, ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, LOG, MAX, MIN, REAL @@ -348,7 +348,7 @@ END IF END IF MINWRK = MIN( MINWRK, MAXWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN @@ -376,7 +376,6 @@ SFMIN = SLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * @@ -615,7 +614,7 @@ END IF * 10 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWORK RETURN * diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index 89d3a6e4f..2e4b0cdd5 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -207,8 +207,8 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -355,7 +355,7 @@ END IF MAXWRK = MAX( MINWRK, MAXWRK ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -12 @@ -731,7 +731,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGELSS diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f index 5377bc720..b89918656 100644 --- a/lapack-netlib/SRC/sgelst.f +++ b/lapack-netlib/SRC/sgelst.f @@ -176,7 +176,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelst * *> \par Contributors: * ================== @@ -222,15 +222,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, $ SLASCL, SLASET, STRTRS, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -268,7 +268,7 @@ * MNNRHS = MAX( MN, NRHS ) LWOPT = MAX( 1, (MN+MNNRHS)*NB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * END IF * @@ -283,7 +283,7 @@ * IF( MIN( M, N, NRHS ).EQ.0 ) THEN CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -309,7 +309,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -332,7 +331,7 @@ * Matrix all zero. Return zero solution. * CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) RETURN END IF * @@ -522,7 +521,7 @@ $ INFO ) END IF * - WORK( 1 ) = REAL( LWOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f index 89dd39e80..c7f5069de 100644 --- a/lapack-netlib/SRC/sgelsy.f +++ b/lapack-netlib/SRC/sgelsy.f @@ -191,7 +191,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup gelsy * *> \par Contributors: * ================== @@ -234,11 +234,11 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEQP3, SLABAD, SLAIC1, SLASCL, SLASET, + EXTERNAL SCOPY, SGEQP3, SLAIC1, SLASCL, SLASET, $ SORMQR, SORMRZ, STRSM, STZRZF, XERBLA * .. * .. Intrinsic Functions .. @@ -282,7 +282,7 @@ LWKOPT = MAX( LWKMIN, $ MN + 2*N + NB*( N + 1 ), 2*MN + NB*NRHS ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 @@ -307,7 +307,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max entries outside range [SMLNUM,BIGNUM] * @@ -469,7 +468,7 @@ END IF * 70 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 536abf184..83536825c 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -119,7 +119,7 @@ *> The dimension of the array WORK. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, SGELQ will use either *> SLASWLQ (if the matrix is wide-and-short) or SGELQT to compute *> the LQ factorization. -*> This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to +*> This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in SLAMSWLQ or SGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE SGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -191,6 +193,10 @@ LOGICAL LSAME EXTERNAL LSAME * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SLAMSWLQ, SGEMLQT, XERBLA * .. @@ -249,7 +255,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LW ) END IF * IF( INFO.NE.0 ) THEN @@ -274,7 +280,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LW ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 2a9257459..3207f8bfd 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -120,7 +120,7 @@ *> The dimension of the array WORK. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +160,14 @@ *> block sizes MB and NB returned by ILAENV, SGEQR will use either *> SLATSQR (if the matrix is tall-and-skinny) or SGEQRT to compute *> the QR factorization. -*> This version of SGEMQR will use either SLAMTSQR or SGEMQRT to +*> This version of SGEMQR will use either SLAMTSQR or SGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in SLAMTSQR or SGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE SGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -191,7 +193,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -251,7 +254,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK(LW) END IF * IF( INFO.NE.0 ) THEN @@ -276,7 +279,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK(LW) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index efecfbb3c..b1266c89e 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -113,7 +113,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -162,7 +162,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -186,7 +187,7 @@ NB = ILAENV( 1, 'SGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -276,7 +277,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL SGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQLF diff --git a/lapack-netlib/SRC/sgeqp3.f b/lapack-netlib/SRC/sgeqp3.f index 493bdae6a..9f2f40b2e 100644 --- a/lapack-netlib/SRC/sgeqp3.f +++ b/lapack-netlib/SRC/sgeqp3.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqp3 * *> \par Further Details: * ===================== @@ -177,8 +177,8 @@ * .. * .. External Functions .. INTEGER ILAENV - REAL SNRM2 - EXTERNAL ILAENV, SNRM2 + REAL SNRM2, SROUNDUP_LWORK + EXTERNAL ILAENV, SNRM2, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -205,7 +205,7 @@ NB = ILAENV( INB, 'SGEQRF', ' ', M, N, -1, -1 ) LWKOPT = 2*N + ( N + 1 )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -347,7 +347,7 @@ * END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQP3 diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index b24615f7a..689fe1aea 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqrf * *> \par Further Details: * ===================== @@ -170,7 +170,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -199,7 +200,7 @@ ELSE LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -274,7 +275,7 @@ $ CALL SGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQRF diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index 03d33654b..d1ee2a828 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -122,7 +122,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -173,7 +173,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,7 +183,7 @@ INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -272,7 +273,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgerqf.f b/lapack-netlib/SRC/sgerqf.f index 037cd5345..1d3400a1f 100644 --- a/lapack-netlib/SRC/sgerqf.f +++ b/lapack-netlib/SRC/sgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -163,7 +163,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -187,7 +188,7 @@ NB = ILAENV( 1, 'SGERQF', ' ', M, N, -1, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF ( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) @@ -278,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL SGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGERQF diff --git a/lapack-netlib/SRC/sgesvd.f b/lapack-netlib/SRC/sgesvd.f index 83321ffaa..d3fa94582 100644 --- a/lapack-netlib/SRC/sgesvd.f +++ b/lapack-netlib/SRC/sgesvd.f @@ -203,7 +203,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup gesvd * * ===================================================================== SUBROUTINE SGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT, @@ -251,8 +251,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -628,7 +628,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -13 @@ -3493,7 +3493,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f index b6495dbd4..8b55b9b2e 100644 --- a/lapack-netlib/SRC/sgesvdx.f +++ b/lapack-netlib/SRC/sgesvdx.f @@ -254,7 +254,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE SGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -301,8 +301,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -456,7 +456,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = REAL( MAXWRK ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -822,7 +822,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = REAL( MAXWRK ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index 749ede9a7..fe71bc4a5 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -137,7 +137,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,7 +153,7 @@ INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -250,7 +251,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index e6ce705fa..d89c6a4e6 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -188,15 +188,15 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLABAD, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQR, SGEMQR, SLASCL, SLASET, $ STRTRS, XERBLA, SGELQ, SGEMLQ * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN, INT + INTRINSIC MAX, MIN, INT * .. * .. Executable Statements .. * @@ -262,7 +262,7 @@ INFO = -10 END IF * - WORK( 1 ) = REAL( WSIZEO ) + WORK( 1 ) = SROUNDUP_LWORK( WSIZEO ) * END IF * @@ -271,7 +271,7 @@ RETURN END IF IF( LQUERY ) THEN - IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM ) + IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM ) RETURN END IF IF( LWORK.LT.WSIZEO ) THEN @@ -294,7 +294,6 @@ * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * @@ -482,7 +481,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = REAL( TSZO + LWO ) + WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO ) RETURN * * End of SGETSLS diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index f9580da7b..d80ff4da8 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +200,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, $ XERBLA @@ -277,14 +281,14 @@ CALL XERBLA( 'SGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,7 +345,7 @@ END IF END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SGETSQRHRT diff --git a/lapack-netlib/SRC/sgges.f b/lapack-netlib/SRC/sgges.f index 3834aea00..8f42882dd 100644 --- a/lapack-netlib/SRC/sgges.f +++ b/lapack-netlib/SRC/sgges.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE SGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,15 +321,14 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -406,7 +405,7 @@ MINWRK = 1 MAXWRK = 1 END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -19 @@ -431,7 +430,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -668,7 +666,7 @@ * 40 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * RETURN * diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index b27704ff5..e35d4955a 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -273,7 +273,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE SGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -318,14 +318,13 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -408,7 +407,7 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +429,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -659,7 +657,7 @@ * 40 CONTINUE * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f index a6c0443ba..e5a14fc19 100644 --- a/lapack-netlib/SRC/sggesx.f +++ b/lapack-netlib/SRC/sggesx.f @@ -337,7 +337,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggesx * *> \par Further Details: * ===================== @@ -405,15 +405,14 @@ REAL DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -510,7 +509,7 @@ MAXWRK = 1 LWRK = 1 END IF - WORK( 1 ) = LWRK + WORK( 1 ) = SROUNDUP_LWORK(LWRK) IF( WANTSN .OR. N.EQ.0 ) THEN LIWMIN = 1 ELSE @@ -544,7 +543,6 @@ EPS = SLAMCH( 'P' ) SAFMIN = SLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * @@ -807,7 +805,7 @@ * 50 CONTINUE * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sggev.f b/lapack-netlib/SRC/sggev.f index 69744b72b..cacad7cac 100644 --- a/lapack-netlib/SRC/sggev.f +++ b/lapack-netlib/SRC/sggev.f @@ -218,7 +218,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggev * * ===================================================================== SUBROUTINE SGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, ALPHAI, @@ -257,15 +257,14 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -335,7 +334,7 @@ MAXWRK = MAX( MAXWRK, N*( 7 + $ ILAENV( 1, 'SORGQR', ' ', N, 1, N, -1 ) ) ) END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) $ INFO = -16 @@ -358,7 +357,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -581,7 +579,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGGEV diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index 945c3a017..c82d2187f 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -217,7 +217,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE SGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -256,14 +256,13 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -341,7 +340,7 @@ $ WORK, -1, 0, IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) END IF - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * END IF * @@ -362,7 +361,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -578,7 +576,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGEV3 diff --git a/lapack-netlib/SRC/sggevx.f b/lapack-netlib/SRC/sggevx.f index bb05f499a..63164a021 100644 --- a/lapack-netlib/SRC/sggevx.f +++ b/lapack-netlib/SRC/sggevx.f @@ -352,7 +352,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEeigen +*> \ingroup ggevx * *> \par Further Details: * ===================== @@ -427,15 +427,15 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD, - $ SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC, - $ STGSNA, XERBLA + EXTERNAL SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY, + $ SLASCL, SLASET, SORGQR, SORMQR, STGEVC, STGSNA, + $ XERBLA * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT @@ -532,7 +532,7 @@ $ N*ILAENV( 1, 'SORGQR', ' ', N, 1, N, 0 ) ) END IF END IF - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -26 @@ -557,7 +557,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -855,7 +854,7 @@ CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) END IF * - WORK( 1 ) = MAXWRK + WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) RETURN * * End of SGGEVX diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index 56b4dba52..37094e4f2 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -177,7 +177,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup ggglm * * ===================================================================== SUBROUTINE SGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, @@ -212,7 +212,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -251,7 +252,7 @@ LWKMIN = M + N + P LWKOPT = M + NP + MAX( N, P )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 23acf6ec5..9c5858b5a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -260,14 +260,15 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGGHRD, SLARTG, SLASET, SORM22, SROT, SGEMM, $ SGEMV, STRMV, SLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX + INTRINSIC MAX * .. * .. Executable Statements .. * @@ -276,7 +277,7 @@ INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -885,7 +886,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 59addc3f4..53e3f8e45 100644 --- a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERsolve +*> \ingroup gglse * * ===================================================================== SUBROUTINE SGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, @@ -207,7 +207,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -246,7 +247,7 @@ LWKMIN = M + N + P LWKOPT = P + MN + MAX( M, N )*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -12 diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index 59b498da5..ebb42a899 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -236,7 +236,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -251,7 +252,7 @@ NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -287,7 +288,8 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 8b7d4786a..2163f1ef8 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -235,7 +235,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -250,7 +251,7 @@ NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -287,7 +288,8 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL SGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 9077f2ea8..053fff5de 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== @@ -372,8 +372,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANGE - EXTERNAL LSAME, SLAMCH, SLANGE + REAL SLAMCH, SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SGGSVP3, STGSJA, XERBLA @@ -429,7 +429,7 @@ LWKOPT = N + INT( WORK( 1 ) ) LWKOPT = MAX( 2*N, LWKOPT ) LWKOPT = MAX( 1, LWKOPT ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -492,7 +492,7 @@ END IF 20 CONTINUE * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGSVD3 diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index 4f76b32bc..a463b9064 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== @@ -300,7 +300,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, @@ -365,7 +366,7 @@ CALL SGEQP3( M, N, A, LDA, IWORK, TAU, WORK, -1, INFO ) LWKOPT = MAX( LWKOPT, INT( WORK ( 1 ) ) ) LWKOPT = MAX( 1, LWKOPT ) - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -560,7 +561,7 @@ * END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SGGSVP3 diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 6543f8cb1..9ad64d2bf 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -282,7 +282,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup hgeqz * *> \par Further Details: * ===================== @@ -346,8 +346,9 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANHS, SLAPY2, SLAPY3 - EXTERNAL LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3 + REAL SLAMCH, SLANHS, SLAPY2, SLAPY3, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLAG2, SLARFG, SLARTG, SLASET, SLASV2, SROT, @@ -1364,7 +1365,7 @@ * Exit (other than argument error) -- return optimal workspace size * 420 CONTINUE - WORK( 1 ) = REAL( N ) + WORK( 1 ) = SROUNDUP_LWORK( N ) RETURN * * End of SHGEQZ diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index 3b8d4c4d8..68b9fe6bd 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -233,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup hseqr * *> \par Contributors: * ================== @@ -358,7 +358,8 @@ * .. External Functions .. INTEGER ILAENV LOGICAL LSAME - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SLAHQR, SLAQR0, SLASET, XERBLA @@ -373,7 +374,7 @@ WANTT = LSAME( JOB, 'S' ) INITZ = LSAME( COMPZ, 'I' ) WANTZ = INITZ .OR. LSAME( COMPZ, 'V' ) - WORK( 1 ) = REAL( MAX( 1, N ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N ) ) LQUERY = LWORK.EQ.-1 * INFO = 0 diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f index 62c4ef5eb..caf79fd1c 100644 --- a/lapack-netlib/SRC/slaqr2.f +++ b/lapack-netlib/SRC/slaqr2.f @@ -263,7 +263,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr2 * *> \par Contributors: * ================== @@ -305,11 +305,11 @@ LOGICAL BULGE, SORTED * .. * .. External Functions .. - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR, + EXTERNAL SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, $ SLANV2, SLARF, SLARFG, SLASET, SORMHR, STREXC * .. * .. Intrinsic Functions .. @@ -343,7 +343,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -362,7 +362,6 @@ * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N ) / ULP ) * @@ -674,7 +673,7 @@ * * ==== Return optimal workspace. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR2 ==== * diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f index 519ccd6ed..d3ffb0f96 100644 --- a/lapack-netlib/SRC/slaqr3.f +++ b/lapack-netlib/SRC/slaqr3.f @@ -260,7 +260,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr3 * *> \par Contributors: * ================== @@ -302,14 +302,13 @@ LOGICAL BULGE, SORTED * .. * .. External Functions .. - REAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK INTEGER ILAENV - EXTERNAL SLAMCH, ILAENV + EXTERNAL SLAMCH, SROUNDUP_LWORK, ILAENV * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR, - $ SLANV2, SLAQR4, SLARF, SLARFG, SLASET, SORMHR, - $ STREXC + EXTERNAL SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, SLANV2, + $ SLAQR4, SLARF, SLARFG, SLASET, SORMHR, STREXC * .. * .. Intrinsic Functions .. INTRINSIC ABS, INT, MAX, MIN, REAL, SQRT @@ -348,7 +347,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -367,7 +366,6 @@ * SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE / SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N ) / ULP ) * @@ -685,7 +683,7 @@ * * ==== Return optimal workspace. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR3 ==== * diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index 1f0a51c85..d6721df97 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -239,7 +239,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup laqr4 * *> \par Contributors: * ================== @@ -316,7 +316,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Local Arrays .. REAL ZDUM( 1, 1 ) @@ -325,7 +326,7 @@ EXTERNAL SLACPY, SLAHQR, SLANV2, SLAQR2, SLAQR5 * .. * .. Intrinsic Functions .. - INTRINSIC ABS, INT, MAX, MIN, MOD, REAL + INTRINSIC ABS, INT, MAX, MIN, MOD * .. * .. Executable Statements .. INFO = 0 @@ -401,7 +402,7 @@ * ==== Quick return in case of workspace query. ==== * IF( LWORK.EQ.-1 ) THEN - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -732,7 +733,7 @@ * * ==== Return the optimal value of LWORK. ==== * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * * ==== End of SLAQR4 ==== * diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 8b2d3286e..c128093e4 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -294,7 +294,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz0 *> * ===================================================================== RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, @@ -331,7 +331,7 @@ * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH, SLANHS + REAL, EXTERNAL :: SLAMCH, SLANHS, SROUNDUP_LWORK LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -461,7 +461,7 @@ LWORKREQ = MAX( ITEMP1+2*NW**2, ITEMP2+2*NBR**2 ) IF ( LWORK .EQ.-1 ) THEN - WORK( 1 ) = REAL( LWORKREQ ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKREQ ) RETURN ELSE IF ( LWORK .LT. LWORKREQ ) THEN INFO = -19 diff --git a/lapack-netlib/SRC/slaqz3.f b/lapack-netlib/SRC/slaqz3.f index edb8a6012..979381364 100644 --- a/lapack-netlib/SRC/slaqz3.f +++ b/lapack-netlib/SRC/slaqz3.f @@ -228,7 +228,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz3 *> * ===================================================================== RECURSIVE SUBROUTINE SLAQZ3( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NW, @@ -258,9 +258,9 @@ REAL :: S, SMLNUM, ULP, SAFMIN, SAFMAX, C1, S1, TEMP * External Functions - EXTERNAL :: XERBLA, STGEXC, SLABAD, SLAQZ0, SLACPY, SLASET, + EXTERNAL :: XERBLA, STGEXC, SLAQZ0, SLACPY, SLASET, $ SLAQZ2, SROT, SLARTG, SLAG2, SGEMM - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SROUNDUP_LWORK INFO = 0 @@ -286,7 +286,7 @@ LWORKREQ = MAX( LWORKREQ, N*NW, 2*NW**2+N ) IF ( LWORK .EQ.-1 ) THEN * workspace query, quick return - WORK( 1 ) = LWORKREQ + WORK( 1 ) = SROUNDUP_LWORK(LWORKREQ) RETURN ELSE IF ( LWORK .LT. LWORKREQ ) THEN INFO = -26 @@ -300,7 +300,6 @@ * Get machine constants SAFMIN = SLAMCH( 'SAFE MINIMUM' ) SAFMAX = ONE/SAFMIN - CALL SLABAD( SAFMIN, SAFMAX ) ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) diff --git a/lapack-netlib/SRC/slaqz4.f b/lapack-netlib/SRC/slaqz4.f index 3c307dd47..95b2784c5 100644 --- a/lapack-netlib/SRC/slaqz4.f +++ b/lapack-netlib/SRC/slaqz4.f @@ -204,7 +204,7 @@ * *> \date May 2020 * -*> \ingroup doubleGEcomputational +*> \ingroup laqz4 *> * ===================================================================== SUBROUTINE SLAQZ4( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NSHIFTS, @@ -236,6 +236,7 @@ * External functions EXTERNAL :: XERBLA, SGEMM, SLAQZ1, SLAQZ2, SLASET, SLARTG, SROT, $ SLACPY + REAL, EXTERNAL :: SROUNDUP_LWORK INFO = 0 IF ( NBLOCK_DESIRED .LT. NSHIFTS+1 ) THEN @@ -243,7 +244,7 @@ END IF IF ( LWORK .EQ.-1 ) THEN * workspace query, quick return - WORK( 1 ) = N*NBLOCK_DESIRED + WORK( 1 ) = SROUNDUP_LWORK(N*NBLOCK_DESIRED) RETURN ELSE IF ( LWORK .LT. N*NBLOCK_DESIRED ) THEN INFO = -25 diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 95e0ddcce..685f823a0 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -159,6 +159,8 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, $ INFO) @@ -183,7 +185,8 @@ * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA * .. INTRINSIC FUNCTIONS .. @@ -262,7 +265,7 @@ $ WORK, INFO ) END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = SROUNDUP_LWORK(M * MB) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index b1a5c03a2..46f4ab130 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGBcomputational +*> \ingroup ungbr * * ===================================================================== SUBROUTINE SORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -179,7 +179,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORGLQ, SORGQR, XERBLA @@ -240,7 +241,7 @@ CALL XERBLA( 'SORGBR', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -326,7 +327,7 @@ END IF END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGBR From 2ce67e2ada4f4d5033f83e860c2857db71eb3965 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 14:42:52 +0100 Subject: [PATCH 406/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904) --- lapack-netlib/SRC/sorghr.f | 9 ++- lapack-netlib/SRC/sorglq.f | 9 ++- lapack-netlib/SRC/sorgql.f | 9 ++- lapack-netlib/SRC/sorgqr.f | 9 ++- lapack-netlib/SRC/sorgrq.f | 9 ++- lapack-netlib/SRC/sorgtr.f | 9 ++- lapack-netlib/SRC/sorgtsqr.f | 14 ++-- lapack-netlib/SRC/sorgtsqr_row.f | 14 ++-- lapack-netlib/SRC/sorm22.f | 11 +-- lapack-netlib/SRC/sormbr.f | 9 ++- lapack-netlib/SRC/sormhr.f | 9 ++- lapack-netlib/SRC/sormlq.f | 9 ++- lapack-netlib/SRC/sormql.f | 9 ++- lapack-netlib/SRC/sormqr.f | 9 ++- lapack-netlib/SRC/sormrq.f | 9 ++- lapack-netlib/SRC/sormrz.f | 9 ++- lapack-netlib/SRC/sormtr.f | 9 ++- lapack-netlib/SRC/ssbev_2stage.f | 25 ++++--- lapack-netlib/SRC/ssbevd.f | 10 +-- lapack-netlib/SRC/ssbevd_2stage.f | 21 +++--- lapack-netlib/SRC/ssbevx_2stage.f | 23 +++--- lapack-netlib/SRC/ssbgvd.f | 9 ++- lapack-netlib/SRC/sspevd.f | 10 +-- lapack-netlib/SRC/sspgvd.f | 9 ++- lapack-netlib/SRC/sstedc.f | 10 +-- lapack-netlib/SRC/sstemr.f | 8 +- lapack-netlib/SRC/sstevd.f | 10 +-- lapack-netlib/SRC/sstevr.f | 10 +-- lapack-netlib/SRC/ssyev.f | 10 +-- lapack-netlib/SRC/ssyev_2stage.f | 27 +++---- lapack-netlib/SRC/ssyevd.f | 10 +-- lapack-netlib/SRC/ssyevr.f | 10 +-- lapack-netlib/SRC/ssyevr_2stage.f | 27 +++---- lapack-netlib/SRC/ssyevx.f | 12 +-- lapack-netlib/SRC/ssyevx_2stage.f | 25 ++++--- lapack-netlib/SRC/ssygv.f | 9 ++- lapack-netlib/SRC/ssygv_2stage.f | 19 ++--- lapack-netlib/SRC/ssygvd.f | 9 ++- lapack-netlib/SRC/ssygvx.f | 9 ++- lapack-netlib/SRC/ssysv.f | 9 ++- lapack-netlib/SRC/ssysv_aa.f | 9 ++- lapack-netlib/SRC/ssysv_aa_2stage.f | 7 +- lapack-netlib/SRC/ssysv_rk.f | 9 ++- lapack-netlib/SRC/ssysv_rook.f | 9 ++- lapack-netlib/SRC/ssysvx.f | 10 +-- lapack-netlib/SRC/ssytrd.f | 9 ++- lapack-netlib/SRC/ssytrd_sb2st.F | 107 ++++++++++++++------------- lapack-netlib/SRC/ssytrd_sy2sb.f | 9 ++- lapack-netlib/SRC/ssytrf.f | 9 ++- lapack-netlib/SRC/ssytrf_aa.f | 9 ++- lapack-netlib/SRC/ssytrf_aa_2stage.f | 7 +- lapack-netlib/SRC/ssytrf_rk.f | 9 ++- lapack-netlib/SRC/ssytrf_rook.f | 9 ++- lapack-netlib/SRC/ssytri_3.f | 9 ++- lapack-netlib/SRC/ssytrs_aa.f | 7 +- lapack-netlib/SRC/stgexc.f | 8 +- lapack-netlib/SRC/stgsen.f | 10 +-- lapack-netlib/SRC/stgsna.f | 11 +-- lapack-netlib/SRC/stgsyl.f | 9 ++- lapack-netlib/SRC/strsen.f | 10 +-- lapack-netlib/SRC/stzrzf.f | 9 ++- 61 files changed, 414 insertions(+), 357 deletions(-) diff --git a/lapack-netlib/SRC/sorghr.f b/lapack-netlib/SRC/sorghr.f index f65cd898c..624ede282 100644 --- a/lapack-netlib/SRC/sorghr.f +++ b/lapack-netlib/SRC/sorghr.f @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unghr * * ===================================================================== SUBROUTINE SORGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) @@ -150,7 +150,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +178,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'SORGQR', ' ', NH, NH, NH, -1 ) LWKOPT = MAX( 1, NH )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -229,7 +230,7 @@ CALL SORGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ), $ WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGHR diff --git a/lapack-netlib/SRC/sorglq.f b/lapack-netlib/SRC/sorglq.f index b1d107964..30f6d5d48 100644 --- a/lapack-netlib/SRC/sorglq.f +++ b/lapack-netlib/SRC/sorglq.f @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unglq * * ===================================================================== SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -155,7 +155,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -164,7 +165,7 @@ INFO = 0 NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, M )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -278,7 +279,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGLQ diff --git a/lapack-netlib/SRC/sorgql.f b/lapack-netlib/SRC/sorgql.f index 34ab5edef..f104e64b2 100644 --- a/lapack-netlib/SRC/sorgql.f +++ b/lapack-netlib/SRC/sorgql.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungql * * ===================================================================== SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -285,7 +286,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGQL diff --git a/lapack-netlib/SRC/sorgqr.f b/lapack-netlib/SRC/sorgqr.f index 056de54d7..a87ea6c65 100644 --- a/lapack-netlib/SRC/sorgqr.f +++ b/lapack-netlib/SRC/sorgqr.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungqr * * ===================================================================== SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -165,7 +166,7 @@ INFO = 0 NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 ) LWKOPT = MAX( 1, N )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -279,7 +280,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGQR diff --git a/lapack-netlib/SRC/sorgrq.f b/lapack-netlib/SRC/sorgrq.f index d9b6ccbe6..331f20904 100644 --- a/lapack-netlib/SRC/sorgrq.f +++ b/lapack-netlib/SRC/sorgrq.f @@ -121,7 +121,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungrq * * ===================================================================== SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -156,7 +156,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -181,7 +182,7 @@ NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 ) LWKOPT = M*NB END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN INFO = -8 @@ -285,7 +286,7 @@ 50 CONTINUE END IF * - WORK( 1 ) = IWS + WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * * End of SORGRQ diff --git a/lapack-netlib/SRC/sorgtr.f b/lapack-netlib/SRC/sorgtr.f index 67bde00cb..6a1dc3034 100644 --- a/lapack-netlib/SRC/sorgtr.f +++ b/lapack-netlib/SRC/sorgtr.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup ungtr * * ===================================================================== SUBROUTINE SORGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO ) @@ -146,7 +146,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORGQL, SORGQR, XERBLA @@ -178,7 +179,7 @@ NB = ILAENV( 1, 'SORGQR', ' ', N-1, N-1, N-1, -1 ) END IF LWKOPT = MAX( 1, N-1 )*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -244,7 +245,7 @@ $ LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORGTR diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f index 692eba1d9..0be27af77 100644 --- a/lapack-netlib/SRC/sorgtsqr.f +++ b/lapack-netlib/SRC/sorgtsqr.f @@ -157,7 +157,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleOTHERcomputational +*> \ingroup ungtsqr * *> \par Contributors: * ================== @@ -196,11 +196,15 @@ LOGICAL LQUERY INTEGER IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SCOPY, SLAMTSQR, SLASET, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -257,14 +261,14 @@ CALL XERBLA( 'SORGTSQR', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -297,7 +301,7 @@ CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 ) END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SORGTSQR diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f index d2a2150cd..5a1e1ff07 100644 --- a/lapack-netlib/SRC/sorgtsqr_row.f +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -169,7 +169,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup sigleOTHERcomputational +*> \ingroup ungtsqr_row * *> \par Contributors: * ================== @@ -213,11 +213,15 @@ * .. Local Arrays .. REAL DUMMY( 1, 1 ) * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SLARFB_GETT, SLASET, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -255,14 +259,14 @@ CALL XERBLA( 'SORGTSQR_ROW', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -371,7 +375,7 @@ * END DO * - WORK( 1 ) = REAL( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of SORGTSQR_ROW diff --git a/lapack-netlib/SRC/sorm22.f b/lapack-netlib/SRC/sorm22.f index 15096870a..886adb2cf 100644 --- a/lapack-netlib/SRC/sorm22.f +++ b/lapack-netlib/SRC/sorm22.f @@ -155,7 +155,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unm22 * * ===================================================================== SUBROUTINE SORM22( SIDE, TRANS, M, N, N1, N2, Q, LDQ, C, LDC, @@ -187,13 +187,14 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, STRMM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC REAL, MAX, MIN + INTRINSIC MAX, MIN * .. * .. Executable Statements .. * @@ -237,7 +238,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = M*N - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +431,7 @@ END IF END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SORM22 diff --git a/lapack-netlib/SRC/sormbr.f b/lapack-netlib/SRC/sormbr.f index efe5be41a..e2dccc363 100644 --- a/lapack-netlib/SRC/sormbr.f +++ b/lapack-netlib/SRC/sormbr.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmbr * * ===================================================================== SUBROUTINE SORMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C, @@ -217,7 +217,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMLQ, SORMQR, XERBLA @@ -285,7 +286,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -363,7 +364,7 @@ $ TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMBR diff --git a/lapack-netlib/SRC/sormhr.f b/lapack-netlib/SRC/sormhr.f index 2d2053af4..e033feb38 100644 --- a/lapack-netlib/SRC/sormhr.f +++ b/lapack-netlib/SRC/sormhr.f @@ -171,7 +171,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmhr * * ===================================================================== SUBROUTINE SORMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMQR, XERBLA @@ -253,7 +254,7 @@ NB = ILAENV( 1, 'SORMQR', SIDE // TRANS, M, NH, NH, -1 ) END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -285,7 +286,7 @@ CALL SORMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA, $ TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMHR diff --git a/lapack-netlib/SRC/sormlq.f b/lapack-netlib/SRC/sormlq.f index ee996e560..1a32568b6 100644 --- a/lapack-netlib/SRC/sormlq.f +++ b/lapack-netlib/SRC/sormlq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmlq * * ===================================================================== SUBROUTINE SORMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORML2, XERBLA @@ -246,7 +247,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'SORMLQ', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -338,7 +339,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMLQ diff --git a/lapack-netlib/SRC/sormql.f b/lapack-netlib/SRC/sormql.f index 72a8d22ee..9564d4141 100644 --- a/lapack-netlib/SRC/sormql.f +++ b/lapack-netlib/SRC/sormql.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmql * * ===================================================================== SUBROUTINE SORMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORM2L, XERBLA @@ -249,7 +250,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -330,7 +331,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMQL diff --git a/lapack-netlib/SRC/sormqr.f b/lapack-netlib/SRC/sormqr.f index 5d4256f09..adb1203df 100644 --- a/lapack-netlib/SRC/sormqr.f +++ b/lapack-netlib/SRC/sormqr.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmqr * * ===================================================================== SUBROUTINE SORMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -194,7 +194,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORM2R, XERBLA @@ -245,7 +246,7 @@ NB = MIN( NBMAX, ILAENV( 1, 'SORMQR', SIDE // TRANS, M, N, K, $ -1 ) ) LWKOPT = NW*NB + TSIZE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -331,7 +332,7 @@ $ C( IC, JC ), LDC, WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMQR diff --git a/lapack-netlib/SRC/sormrq.f b/lapack-netlib/SRC/sormrq.f index 62fcdacdb..f091f0507 100644 --- a/lapack-netlib/SRC/sormrq.f +++ b/lapack-netlib/SRC/sormrq.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmrq * * ===================================================================== SUBROUTINE SORMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, @@ -195,7 +195,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARFB, SLARFT, SORMR2, XERBLA @@ -250,7 +251,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -337,7 +338,7 @@ $ WORK, LDWORK ) 10 CONTINUE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMRQ diff --git a/lapack-netlib/SRC/sormrz.f b/lapack-netlib/SRC/sormrz.f index cdadd62b5..b037a984b 100644 --- a/lapack-netlib/SRC/sormrz.f +++ b/lapack-netlib/SRC/sormrz.f @@ -168,7 +168,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmrz * *> \par Contributors: * ================== @@ -213,7 +213,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLARZB, SLARZT, SORMR3, XERBLA @@ -271,7 +272,7 @@ $ K, -1 ) ) LWKOPT = NW*NB + TSIZE END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +368,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/sormtr.f b/lapack-netlib/SRC/sormtr.f index 3ba749fee..1bc87768f 100644 --- a/lapack-netlib/SRC/sormtr.f +++ b/lapack-netlib/SRC/sormtr.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unmtr * * ===================================================================== SUBROUTINE SORMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC, @@ -192,7 +192,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SORMQL, SORMQR, XERBLA @@ -256,7 +257,7 @@ END IF END IF LWKOPT = NW*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -301,7 +302,7 @@ CALL SORMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU, $ C( I1, I2 ), LDC, WORK, LWORK, IINFO ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SORMTR diff --git a/lapack-netlib/SRC/ssbev_2stage.f b/lapack-netlib/SRC/ssbev_2stage.f index 5752c1ecc..71ace4e27 100644 --- a/lapack-netlib/SRC/ssbev_2stage.f +++ b/lapack-netlib/SRC/ssbev_2stage.f @@ -131,7 +131,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + N @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbev_2stage * *> \par Further Details: * ===================== @@ -182,7 +182,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -190,11 +190,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -232,12 +232,13 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA, - $ SSYTRD_SB2ST + $ SSYTRD_SB2ST * .. * .. Intrinsic Functions .. INTRINSIC SQRT @@ -268,7 +269,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -277,7 +278,7 @@ LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -343,7 +344,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call SSTEQR. @@ -368,7 +369,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f index e87f9a030..e4118dbed 100644 --- a/lapack-netlib/SRC/ssbevd.f +++ b/lapack-netlib/SRC/ssbevd.f @@ -179,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevd * * ===================================================================== SUBROUTINE SSBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK, @@ -213,8 +213,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASCL, SSBTRD, SSCAL, SSTEDC, @@ -259,7 +259,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -342,7 +342,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f index 014bade48..de3f1c010 100644 --- a/lapack-netlib/SRC/ssbevd_2stage.f +++ b/lapack-netlib/SRC/ssbevd_2stage.f @@ -134,7 +134,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + N @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevd_2stage * *> \par Further Details: * ===================== @@ -206,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -214,11 +214,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -258,8 +258,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC, @@ -307,7 +308,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -374,7 +375,7 @@ LLWRK2 = LWORK - INDWK2 + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W, - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, call SSTEDC. @@ -394,7 +395,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssbevx_2stage.f b/lapack-netlib/SRC/ssbevx_2stage.f index 224b676b7..d25d3639a 100644 --- a/lapack-netlib/SRC/ssbevx_2stage.f +++ b/lapack-netlib/SRC/ssbevx_2stage.f @@ -235,7 +235,7 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 7*N, dimension) where *> dimension = (2KD+1)*N + KD*NTHREADS + 2*N @@ -281,7 +281,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbevx_2stage * *> \par Further Details: * ===================== @@ -299,7 +299,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -307,11 +307,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -357,8 +357,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSB - EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE + REAL SLAMCH, SLANSB, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSB, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMV, SLACPY, SLASCL, SSCAL, @@ -414,7 +415,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ, $ N, KD, -1, -1 ) @@ -423,7 +424,7 @@ LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ, $ N, KD, IB, -1 ) LWMIN = 2*N + LHTRD + LWTRD - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ENDIF * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) @@ -513,7 +514,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ), - $ WORK( INDE ), WORK( INDHOUS ), LHTRD, + $ WORK( INDE ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * If all eigenvalues are desired and ABSTOL is less than or equal @@ -624,7 +625,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f index 7c21ee455..f872e5464 100644 --- a/lapack-netlib/SRC/ssbgvd.f +++ b/lapack-netlib/SRC/ssbgvd.f @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hbgvd * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SPBSTF, SSBGST, SSBTRD, SSTEDC, @@ -292,7 +293,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -353,7 +354,7 @@ CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ ) END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f index 0872e95ac..1aae48d1d 100644 --- a/lapack-netlib/SRC/sspevd.f +++ b/lapack-netlib/SRC/sspevd.f @@ -164,7 +164,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hpevd * * ===================================================================== SUBROUTINE SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, @@ -198,8 +198,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANSP - EXTERNAL LSAME, SLAMCH, SLANSP + REAL SLAMCH, SLANSP, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSP, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SOPMTR, SSCAL, SSPTRD, SSTEDC, SSTERF, XERBLA @@ -240,7 +240,7 @@ END IF END IF IWORK( 1 ) = LIWMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -9 @@ -319,7 +319,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 1a88365f2..c1e14594b 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -191,7 +191,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup hpgvd * *> \par Contributors: * ================== @@ -225,7 +225,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPPTRF, SSPEVD, SSPGST, STPMV, STPSV, XERBLA @@ -267,7 +268,7 @@ LWMIN = 2*N END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -345,7 +346,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f index 61e3c2fda..0e1cb4258 100644 --- a/lapack-netlib/SRC/sstedc.f +++ b/lapack-netlib/SRC/sstedc.f @@ -167,7 +167,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup auxOTHERcomputational +*> \ingroup stedc * *> \par Contributors: * ================== @@ -208,8 +208,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL ILAENV, LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLAED0, SLASCL, SLASET, SLASRT, @@ -268,7 +268,7 @@ LIWMIN = 3 + 5*N END IF END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT. LQUERY ) THEN @@ -463,7 +463,7 @@ END IF * 50 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f index 2ed697b69..62cfa3d4d 100644 --- a/lapack-netlib/SRC/sstemr.f +++ b/lapack-netlib/SRC/sstemr.f @@ -359,8 +359,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLAE2, SLAEV2, SLARRC, SLARRE, SLARRJ, @@ -443,7 +443,7 @@ RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) ) * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( WANTZ .AND. ALLEIG ) THEN @@ -782,7 +782,7 @@ ENDIF * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f index 218af8c76..4fc2a6311 100644 --- a/lapack-netlib/SRC/sstevd.f +++ b/lapack-netlib/SRC/sstevd.f @@ -149,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup stevd * * ===================================================================== SUBROUTINE SSTEVD( JOBZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK, @@ -182,8 +182,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLAMCH, SLANST - EXTERNAL LSAME, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSCAL, SSTEDC, SSTERF, XERBLA @@ -215,7 +215,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -283,7 +283,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, D, 1 ) * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/sstevr.f b/lapack-netlib/SRC/sstevr.f index 2ab63eb3e..42f49b11b 100644 --- a/lapack-netlib/SRC/sstevr.f +++ b/lapack-netlib/SRC/sstevr.f @@ -287,7 +287,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHEReigen +*> \ingroup stevr * *> \par Contributors: * ================== @@ -336,8 +336,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANST - EXTERNAL LSAME, ILAENV, SLAMCH, SLANST + REAL SLAMCH, SLANST, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANST, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTEMR, SSTEIN, SSTERF, @@ -389,7 +389,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -570,7 +570,7 @@ * IF (wantz .and. INDEIG ) Z( 1,1) = Z(1,1) / 1.002 + .002 * * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN RETURN * diff --git a/lapack-netlib/SRC/ssyev.f b/lapack-netlib/SRC/ssyev.f index 03ed326b8..638445f04 100644 --- a/lapack-netlib/SRC/ssyev.f +++ b/lapack-netlib/SRC/ssyev.f @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heev * * ===================================================================== SUBROUTINE SSYEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO ) @@ -158,8 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, SSYTRD, @@ -190,7 +190,7 @@ IF( INFO.EQ.0 ) THEN NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, ( NB+2 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.MAX( 1, 3*N-1 ) .AND. .NOT.LQUERY ) $ INFO = -8 @@ -274,7 +274,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f index a6fa30cc8..519ee334d 100644 --- a/lapack-netlib/SRC/ssyev_2stage.f +++ b/lapack-netlib/SRC/ssyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -211,8 +211,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, @@ -305,7 +306,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call SSTERF. For eigenvectors, first call @@ -336,7 +337,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index ee0e33384..a5e4638d6 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -160,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== @@ -204,8 +204,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF, @@ -251,7 +251,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -335,7 +335,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index d8e4ce3ea..47e4d7cbf 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -317,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -368,8 +368,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN, @@ -428,7 +428,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +677,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index 8ab2844c6..a2d6a6231 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by SORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -330,7 +330,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +358,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +366,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -416,8 +416,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK, ILAENV, + $ ILAENV2STAGE * .. * .. External Subroutines .. EXTERNAL SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN, @@ -484,7 +485,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN END IF * @@ -608,7 +609,7 @@ * Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -732,7 +733,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 11776e8c5..2204aa39b 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE SSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -285,8 +285,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ, @@ -338,13 +338,13 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssyevx_2stage.f b/lapack-netlib/SRC/ssyevx_2stage.f index 1a2225c87..a8585e5f7 100644 --- a/lapack-netlib/SRC/ssyevx_2stage.f +++ b/lapack-netlib/SRC/ssyevx_2stage.f @@ -208,12 +208,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 8*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 3*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 3*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -259,7 +259,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup heevx_2stage * *> \par Further Details: * ===================== @@ -277,7 +277,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -285,11 +285,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -334,8 +334,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - REAL SLAMCH, SLANSY - EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, SLANSY, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ, @@ -388,7 +389,7 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWMIN = 1 - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) ELSE KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', JOBZ, $ N, -1, -1, -1 ) @@ -487,7 +488,7 @@ INDWRK = INDHOUS + LHTRD LLWORK = LWORK - INDWRK + 1 * - CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWRK ), LLWORK, IINFO ) * @@ -600,7 +601,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f index f39947d92..3a79f5431 100644 --- a/lapack-netlib/SRC/ssygv.f +++ b/lapack-netlib/SRC/ssygv.f @@ -167,7 +167,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegv * * ===================================================================== SUBROUTINE SSYGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, @@ -199,7 +199,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEV, SSYGST, STRMM, STRSM, XERBLA @@ -234,7 +235,7 @@ LWKMIN = MAX( 1, 3*N - 1 ) NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKMIN, ( NB + 2 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -11 @@ -303,7 +304,7 @@ END IF END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYGV diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f index 3d9a44b5e..8719d8c7a 100644 --- a/lapack-netlib/SRC/ssygv_2stage.f +++ b/lapack-netlib/SRC/ssygv_2stage.f @@ -143,12 +143,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -186,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegv_2stage * *> \par Further Details: * ===================== @@ -204,7 +204,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -212,11 +212,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -252,7 +252,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYGST, STRMM, STRSM, XERBLA, @@ -359,7 +360,7 @@ END IF END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of SSYGV_2STAGE diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 3c8bd2a0e..a90d1afb7 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -197,7 +197,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegvd * *> \par Further Details: * ===================== @@ -245,7 +245,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEVD, SSYGST, STRMM, STRSM, XERBLA @@ -289,7 +290,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -361,7 +362,7 @@ END IF END IF * - WORK( 1 ) = LOPT + WORK( 1 ) = SROUNDUP_LWORK(LOPT) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f index 344075c9f..16adefa22 100644 --- a/lapack-netlib/SRC/ssygvx.f +++ b/lapack-netlib/SRC/ssygvx.f @@ -283,7 +283,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYeigen +*> \ingroup hegvx * *> \par Contributors: * ================== @@ -324,7 +324,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SPOTRF, SSYEVX, SSYGST, STRMM, STRSM, XERBLA @@ -380,7 +381,7 @@ LWKMIN = MAX( 1, 8*N ) NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -453,7 +454,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 06a42dfb7..523ea66c1 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv * * ===================================================================== SUBROUTINE SSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -190,7 +190,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF, SSYTRS, SSYTRS2 @@ -225,7 +226,7 @@ CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -258,7 +259,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index 5661332c5..e43d4de7f 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE SSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -181,7 +181,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -216,7 +217,7 @@ $ -1, INFO ) LWKOPT_SYTRS = INT( WORK(1) ) LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -238,7 +239,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index aa862f14b..3d88e068e 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -178,7 +178,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE SSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +208,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -268,7 +269,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9a7dfa4bb..abf862d66 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -205,7 +205,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYsolve +*> \ingroup hesv_rk * *> \par Contributors: * ================== @@ -247,7 +247,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF_RK, SSYTRS_3 @@ -282,7 +283,7 @@ CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -305,7 +306,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index fb7ba8c53..c5c77e562 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -181,7 +181,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesv_rook * *> \par Contributors: * ================== @@ -223,7 +223,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRF_ROOK, SSYTRS_ROOK @@ -258,7 +259,7 @@ CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) LWKOPT = INT( WORK( 1 ) ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -281,7 +282,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index b19ce2641..0d72217eb 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE SSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -311,8 +311,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, SLANSY - EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY + REAL SLAMCH, SLANSY, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACPY, SSYCON, SSYRFS, SSYTRF, SSYTRS, XERBLA @@ -354,7 +354,7 @@ NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -404,7 +404,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssytrd.f b/lapack-netlib/SRC/ssytrd.f index f09ad9ab4..f4fbecdc9 100644 --- a/lapack-netlib/SRC/ssytrd.f +++ b/lapack-netlib/SRC/ssytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -223,7 +223,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -248,7 +249,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) LWKOPT = N*NB - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -365,7 +366,7 @@ $ TAU( I ), IINFO ) END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRD diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index b8386670a..32bae26dc 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the ssytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the ssytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the ssytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of ssytrd_sy2sb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -147,7 +147,7 @@ *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +208,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +216,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +258,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,8 +274,9 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + INTEGER ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -315,7 +316,7 @@ * IF( INFO.EQ.0 ) THEN HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) END IF * IF( INFO.NE.0 ) THEN @@ -355,7 +356,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +364,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +383,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +414,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +423,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL SLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +432,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +441,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +452,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +478,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +495,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +523,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -544,10 +545,10 @@ ENDIF * HOUS( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of SSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 2c92cd14a..4efc43630 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -158,7 +158,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -283,7 +283,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -313,7 +314,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN END IF * @@ -506,7 +507,7 @@ END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index 31e38e466..a788fbcf0 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -202,7 +202,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF, SSYTF2, XERBLA @@ -233,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -352,7 +353,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index 4ba026fc8..d6408a978 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -125,7 +125,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) @@ -159,7 +159,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_AA, SGEMV, SSCAL, SCOPY, SSWAP, SGEMM, @@ -191,7 +192,7 @@ * IF( INFO.EQ.0 ) THEN LWKOPT = (NB+1)*N - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -457,7 +458,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index 07357f2ab..abe6564c5 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -187,7 +187,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SCOPY, SLACPY, @@ -230,7 +231,7 @@ TB( 1 ) = (3*NB+1)*N END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK(N*NB) END IF END IF IF( TQUERY .OR. WQUERY ) THEN diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 8e1ef460a..72830543c 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -280,7 +280,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_RK, SSYTF2_RK, SSWAP, XERBLA @@ -311,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -487,7 +488,7 @@ * END IF * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 653289e2b..339a229e7 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== @@ -228,7 +228,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLASYF_ROOK, SSYTF2_ROOK, XERBLA @@ -259,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF * IF( INFO.NE.0 ) THEN @@ -382,7 +383,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index 58d5df92a..bca01105d 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup singleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -190,7 +190,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI_3X, XERBLA @@ -225,7 +226,7 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -236,7 +237,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 3cfa2a206..12fca0c71 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -123,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -155,7 +155,8 @@ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA @@ -186,7 +187,7 @@ RETURN ELSE IF( LQUERY ) THEN LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * diff --git a/lapack-netlib/SRC/stgexc.f b/lapack-netlib/SRC/stgexc.f index d1ad79936..d68eb5fc7 100644 --- a/lapack-netlib/SRC/stgexc.f +++ b/lapack-netlib/SRC/stgexc.f @@ -195,7 +195,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup tgexc * *> \par Contributors: * ================== @@ -241,6 +241,10 @@ LOGICAL LQUERY INTEGER HERE, LWMIN, NBF, NBL, NBNEXT * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STGEX2, XERBLA * .. @@ -533,7 +537,7 @@ $ GO TO 20 END IF ILST = HERE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of STGEXC diff --git a/lapack-netlib/SRC/stgsen.f b/lapack-netlib/SRC/stgsen.f index f1103d740..ac9c4677a 100644 --- a/lapack-netlib/SRC/stgsen.f +++ b/lapack-netlib/SRC/stgsen.f @@ -304,7 +304,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tgsen * *> \par Further Details: * ===================== @@ -490,8 +490,8 @@ $ XERBLA * .. * .. External Functions .. - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SIGN, SQRT @@ -571,7 +571,7 @@ LIWMIN = 1 END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -852,7 +852,7 @@ END IF 70 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f index 430f3c4b7..e8cb28b95 100644 --- a/lapack-netlib/SRC/stgsna.f +++ b/lapack-netlib/SRC/stgsna.f @@ -230,7 +230,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tgsna * *> \par Further Details: * ===================== @@ -416,8 +416,9 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SDOT, SLAMCH, SLAPY2, SNRM2 - EXTERNAL LSAME, SDOT, SLAMCH, SLAPY2, SNRM2 + REAL SDOT, SLAMCH, SLAPY2, SNRM2, SROUNDUP_LWORK + EXTERNAL LSAME, SDOT, SLAMCH, SLAPY2, SNRM2, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMV, SLACPY, SLAG2, STGEXC, STGSYL, XERBLA @@ -490,7 +491,7 @@ ELSE LWMIN = N END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( MM.LT.M ) THEN INFO = -15 @@ -689,7 +690,7 @@ $ KS = KS + 1 * 20 CONTINUE - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) RETURN * * End of STGSNA diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f index 733c8ab9c..07a82e380 100644 --- a/lapack-netlib/SRC/stgsyl.f +++ b/lapack-netlib/SRC/stgsyl.f @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup tgsyl * *> \par Contributors: * ================== @@ -331,7 +331,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SLACPY, SLASET, SSCAL, STGSY2, XERBLA @@ -384,7 +385,7 @@ ELSE LWMIN = 1 END IF - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -20 @@ -670,7 +671,7 @@ * END IF * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) * RETURN * diff --git a/lapack-netlib/SRC/strsen.f b/lapack-netlib/SRC/strsen.f index c0f75fb12..f7a05ae8b 100644 --- a/lapack-netlib/SRC/strsen.f +++ b/lapack-netlib/SRC/strsen.f @@ -231,7 +231,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup trsen * *> \par Further Details: * ===================== @@ -346,8 +346,8 @@ * .. * .. External Functions .. LOGICAL LSAME - REAL SLANGE - EXTERNAL LSAME, SLANGE + REAL SLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, SLANGE, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLACN2, SLACPY, STREXC, STRSYL, XERBLA @@ -427,7 +427,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN END IF * @@ -558,7 +558,7 @@ END IF 60 CONTINUE * - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK(LWMIN) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/stzrzf.f b/lapack-netlib/SRC/stzrzf.f index e8cbb56b6..516bea5d4 100644 --- a/lapack-netlib/SRC/stzrzf.f +++ b/lapack-netlib/SRC/stzrzf.f @@ -116,7 +116,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup tzrzf * *> \par Contributors: * ================== @@ -179,7 +179,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -207,7 +208,7 @@ LWKOPT = M*NB LWKMIN = MAX( 1, M ) END IF - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 @@ -301,7 +302,7 @@ IF( MU.GT.0 ) $ CALL SLATRZ( MU, N, N-M, A, LDA, TAU, WORK ) * - WORK( 1 ) = LWKOPT + WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * RETURN * From 3d38da2bc401d56162a528547430b0b69664ea38 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 16:50:52 +0100 Subject: [PATCH 407/718] Make vector orthogonalization more reliable (Reference-LAPACK PR 930) --- lapack-netlib/SRC/clarfgp.f | 7 +++--- lapack-netlib/SRC/cunbdb5.f | 50 ++++++++++++++++++++++++++----------- lapack-netlib/SRC/cunbdb6.f | 21 ++++++++-------- lapack-netlib/SRC/dlarfgp.f | 9 ++++--- lapack-netlib/SRC/dorbdb5.f | 50 ++++++++++++++++++++++++++----------- lapack-netlib/SRC/dorbdb6.f | 21 ++++++++-------- lapack-netlib/SRC/slarfgp.f | 7 +++--- lapack-netlib/SRC/sorbdb5.f | 50 ++++++++++++++++++++++++++----------- lapack-netlib/SRC/sorbdb6.f | 21 ++++++++-------- lapack-netlib/SRC/zlarfgp.f | 7 +++--- lapack-netlib/SRC/zunbdb5.f | 50 ++++++++++++++++++++++++++----------- lapack-netlib/SRC/zunbdb6.f | 21 ++++++++-------- 12 files changed, 201 insertions(+), 113 deletions(-) diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index b584484c7..47b5e47b0 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE CLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - REAL ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM + REAL ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM COMPLEX SAVEALPHA * .. * .. External Functions .. @@ -143,11 +143,12 @@ RETURN END IF * + EPS = SLAMCH( 'Precision' ) XNORM = SCNRM2( N-1, X, INCX ) ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * diff --git a/lapack-netlib/SRC/cunbdb5.f b/lapack-netlib/SRC/cunbdb5.f index d2ff4e700..22513cf8b 100644 --- a/lapack-netlib/SRC/cunbdb5.f +++ b/lapack-netlib/SRC/cunbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE CUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + REAL REALZERO + PARAMETER ( REALZERO = 0.0E0 ) COMPLEX ONE, ZERO PARAMETER ( ONE = (1.0E0,0.0E0), ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + REAL EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL CUNBDB6, XERBLA + EXTERNAL CLASSQ, CUNBDB6, CSCAL, XERBLA * .. * .. External Functions .. - REAL SCNRM2 - EXTERNAL SCNRM2 + REAL SLAMCH, SCNRM2 + EXTERNAL SLAMCH, SCNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = SLAMCH( 'Precision' ) * - CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL CSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL CSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SCNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index cd14d9295..566fd76b7 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -174,7 +173,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.1E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.83E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), @@ -223,14 +222,16 @@ * EPS = SLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/dlarfgp.f b/lapack-netlib/SRC/dlarfgp.f index 69845056d..a8cf1b31e 100644 --- a/lapack-netlib/SRC/dlarfgp.f +++ b/lapack-netlib/SRC/dlarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE DLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - DOUBLE PRECISION BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM + DOUBLE PRECISION BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2, DNRM2 @@ -141,11 +141,12 @@ RETURN END IF * + EPS = DLAMCH( 'Precision' ) XNORM = DNRM2( N-1, X, INCX ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * -* H = [+/-1, 0; I], sign chosen so ALPHA >= 0 +* H = [+/-1, 0; I], sign chosen so ALPHA >= 0. * IF( ALPHA.GE.ZERO ) THEN * When TAU.eq.ZERO, the vector is special-cased to be diff --git a/lapack-netlib/SRC/dorbdb5.f b/lapack-netlib/SRC/dorbdb5.f index 6e057a05f..cbd58ae54 100644 --- a/lapack-netlib/SRC/dorbdb5.f +++ b/lapack-netlib/SRC/dorbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE DORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + DOUBLE PRECISION REALZERO + PARAMETER ( REALZERO = 0.0D0 ) DOUBLE PRECISION ONE, ZERO PARAMETER ( ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + DOUBLE PRECISION EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL DORBDB6, XERBLA + EXTERNAL DLASSQ, DORBDB6, DSCAL, XERBLA * .. * .. External Functions .. - DOUBLE PRECISION DNRM2 - EXTERNAL DNRM2 + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DLAMCH, DNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = DLAMCH( 'Precision' ) * - CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL DSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL DSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index 142887684..3e356d001 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -174,7 +173,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.1D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.83D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) @@ -222,14 +221,16 @@ * EPS = DLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/slarfgp.f b/lapack-netlib/SRC/slarfgp.f index df42980c4..c28274c2c 100644 --- a/lapack-netlib/SRC/slarfgp.f +++ b/lapack-netlib/SRC/slarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE SLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - REAL BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM + REAL BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM * .. * .. External Functions .. REAL SLAMCH, SLAPY2, SNRM2 @@ -141,9 +141,10 @@ RETURN END IF * + EPS = SLAMCH( 'Precision' ) XNORM = SNRM2( N-1, X, INCX ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * * H = [+/-1, 0; I], sign chosen so ALPHA >= 0. * diff --git a/lapack-netlib/SRC/sorbdb5.f b/lapack-netlib/SRC/sorbdb5.f index 8c67aedfb..8fb88876f 100644 --- a/lapack-netlib/SRC/sorbdb5.f +++ b/lapack-netlib/SRC/sorbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realOTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE SORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + REAL REALZERO + PARAMETER ( REALZERO = 0.0E0 ) REAL ONE, ZERO PARAMETER ( ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + REAL EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL SORBDB6, XERBLA + EXTERNAL SLASSQ, SORBDB6, SSCAL, XERBLA * .. * .. External Functions .. - REAL SNRM2 - EXTERNAL SNRM2 + REAL SLAMCH, SNRM2 + EXTERNAL SLAMCH, SNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = SLAMCH( 'Precision' ) * - CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL SSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL SSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( SNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( SNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index d320c9e46..eac177722 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -174,7 +173,7 @@ * * .. Parameters .. REAL ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.1E0, REALONE = 1.0E0, + PARAMETER ( ALPHA = 0.83E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) @@ -222,14 +221,16 @@ * EPS = SLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 77eba8e86..6c9efb04c 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -97,7 +97,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERauxiliary +*> \ingroup larfgp * * ===================================================================== SUBROUTINE ZLARFGP( N, ALPHA, X, INCX, TAU ) @@ -122,7 +122,7 @@ * .. * .. Local Scalars .. INTEGER J, KNT - DOUBLE PRECISION ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM + DOUBLE PRECISION ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM COMPLEX*16 SAVEALPHA * .. * .. External Functions .. @@ -143,11 +143,12 @@ RETURN END IF * + EPS = DLAMCH( 'Precision' ) XNORM = DZNRM2( N-1, X, INCX ) ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.EQ.ZERO ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * diff --git a/lapack-netlib/SRC/zunbdb5.f b/lapack-netlib/SRC/zunbdb5.f index 23174fe50..c451ae921 100644 --- a/lapack-netlib/SRC/zunbdb5.f +++ b/lapack-netlib/SRC/zunbdb5.f @@ -148,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup unbdb5 * * ===================================================================== SUBROUTINE ZUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, @@ -169,18 +169,21 @@ * ===================================================================== * * .. Parameters .. + DOUBLE PRECISION REALZERO + PARAMETER ( REALZERO = 0.0D0 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = (1.0D0,0.0D0), ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. INTEGER CHILDINFO, I, J + DOUBLE PRECISION EPS, NORM, SCL, SSQ * .. * .. External Subroutines .. - EXTERNAL ZUNBDB6, XERBLA + EXTERNAL ZLASSQ, ZUNBDB6, ZSCAL, XERBLA * .. * .. External Functions .. - DOUBLE PRECISION DZNRM2 - EXTERNAL DZNRM2 + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DLAMCH, DZNRM2 * .. * .. Intrinsic Function .. INTRINSIC MAX @@ -213,16 +216,33 @@ RETURN END IF * -* Project X onto the orthogonal complement of Q + EPS = DLAMCH( 'Precision' ) * - CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2, - $ WORK, LWORK, CHILDINFO ) +* Project X onto the orthogonal complement of Q if X is nonzero * -* If the projection is nonzero, then return + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) * - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN - RETURN + IF( NORM .GT. N * EPS ) THEN +* Scale vector to unit norm to avoid problems in the caller code. +* Computing the reciprocal is undesirable but +* * xLASCL cannot be used because of the vector increments and +* * the round-off error has a negligible impact on +* orthogonalization. + CALL ZSCAL( M1, ONE / NORM, X1, INCX1 ) + CALL ZSCAL( M2, ONE / NORM, X2, INCX2 ) + CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, + $ LDQ2, WORK, LWORK, CHILDINFO ) +* +* If the projection is nonzero, then return +* + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN + RETURN + END IF END IF * * Project each standard basis vector e_1,...,e_M1 in turn, stopping @@ -238,8 +258,8 @@ END DO CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO @@ -257,8 +277,8 @@ X2(I) = ONE CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, $ LDQ2, WORK, LWORK, CHILDINFO ) - IF( DZNRM2(M1,X1,INCX1) .NE. ZERO - $ .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN + IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO + $ .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN RETURN END IF END DO diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ac7fa4be3..ddc9dfc61 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,9 +41,8 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The Euclidean norm of X must be one and the columns of Q must be -*> orthonormal. The orthogonalized vector will be zero if and only if it -*> lies entirely in the range of Q. +*> The columns of Q must be orthonormal. The orthogonalized vector will +*> be zero if and only if it lies entirely in the range of Q. *> *> The projection is computed with at most two iterations of the *> classical Gram-Schmidt algorithm, see @@ -174,7 +173,7 @@ * * .. Parameters .. DOUBLE PRECISION ALPHA, REALONE, REALZERO - PARAMETER ( ALPHA = 0.1D0, REALONE = 1.0D0, + PARAMETER ( ALPHA = 0.83D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), @@ -223,14 +222,16 @@ * EPS = DLAMCH( 'Precision' ) * +* Compute the Euclidean norm of X +* + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM = SCL * SQRT( SSQ ) +* * First, project X onto the orthogonal complement of Q's column * space -* -* Christoph Conrads: In debugging mode the norm should be computed -* and an assertion added comparing the norm with one. Alas, Fortran -* never made it into 1989 when assert() was introduced into the C -* programming language. - NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N From 9b5f8eb33a263afafb26746a9d188018bba3b3b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 19:35:53 +0100 Subject: [PATCH 408/718] Fix empty function prototypes --- driver/others/blas_server_omp.c | 2 +- driver/others/dynamic_zarch.c | 2 +- driver/others/memory_qalloc.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index f7008fb08..213531057 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -126,7 +126,7 @@ void openblas_set_num_threads(int num_threads) { int blas_thread_init(void){ #if defined(__FreeBSD__) && defined(__clang__) -extern int openblas_omp_num_threads_env(); +extern int openblas_omp_num_threads_env(void); if(blas_omp_number_max <= 0) blas_omp_number_max= openblas_omp_num_threads_env(); diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 5b45aae2f..dd26c8e80 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -13,7 +13,7 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 -extern int openblas_verbose(); +extern int openblas_verbose(void); extern void openblas_warning(int verbose, const char* msg); char* gotoblas_corename(void) { diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 6174d9b75..a2593e01f 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -288,7 +288,7 @@ int goto_get_num_procs (void) { return blas_cpu_number; } -void openblas_fork_handler() +void openblas_fork_handler(void) { // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is // built with "make USE_OPENMP=0". @@ -305,9 +305,9 @@ void openblas_fork_handler() #endif } -extern int openblas_num_threads_env(); -extern int openblas_goto_num_threads_env(); -extern int openblas_omp_num_threads_env(); +extern int openblas_num_threads_env(void); +extern int openblas_goto_num_threads_env(void); +extern int openblas_omp_num_threads_env(void); int blas_get_cpu_number(void){ #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) From c245c12dc232f8474e97b9ceda7a6e276f5c73b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 22:17:39 +0100 Subject: [PATCH 409/718] Update Changelog for 0.3.25 (#4314) * Update Changelog.txt for 0.3.25 --- Changelog.txt | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 3937ef08c..e0fe0ca5a 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,50 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.25 + 12-Nov-2023 + +general: +- improved the error message shown on exceeding the maximum thread count +- improved the code to add supplementary thread buffers in case of overflow +- fixed a potential division by zero in ?ROTG +- improved the ?MATCOPY functions to accept zero-sized rows or columns +- corrected empty prototypes in function declarations +- cleaned up unused declarations in the f2c-converted versions of the LAPACK sources +- fixed compilation with the Cray CCE Compiler suite +- improved link line rewriting to avoid mixed libgomp/libomp builds with clang&gfortran +- worked around OPENMP builds with LLVM14's libomp hanging on FreeBSD +- improved the Makefiles to require less option duplication on "make install" +- imported the following changes from the upcoming release 3.12 of Reference-LAPACK + - deprecate utility functions ?GELQS and ?GEQRS (LAPACK PR 900) + - apply rounding up to workspace calculations done in floating point (LAPACK PR 904) + - avoid overflow in STGEX2/DTGEX2 (LAPACK PR 907) + - fix accumulation in ?LASSQ (LAPACK PR 909) + - fix handling of NaN values in ?GECON (LAPACK PR 926) + - avoid overflow in CBDSQR/ZBDSQR (LAPACK PR 927) + - fix poor vector orthogonalizations in ?ORBDB5/?UNBDB5 (LAPACK PR 928 & 930) + +x86-64: +- fixed compile-time autodetection of AMD Ryzen3 and Ryzen4 cpus +- fixed capability-based fallback selection for unknown cpus in DYNAMIC_ARCH +- added AVX512 optimizations for ?ASUM on Sapphire Rapids and Cooper Lake + +ARM64: +- fixed building on Apple with homebrew gcc +- fixed building with XCODE 15 +- fixed building on A64FX and Cortex A710/X1/X2 +- increased the default buffer size for recent ARM server cpus + +POWER: +- fixed building with the IBM xlf 16.1.1 compiler +- fixed building with IBM XL C +- added support for DYNAMIC_ARCH builds with clang +- fixed union declaration in the BFLOAT16 test case +- enable optimizations for the AIX assembler on POWER10 + +LOONGARCH64: +- added an optimized SGEMV kernel +- added an optimized DTRSM kernel + ==================================================================== Version 0.3.24 03-Sep-2023 From e1f529d0247a4bc1dd9d1f86f4be10c2cfbd2990 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 22:37:11 +0100 Subject: [PATCH 410/718] Add OSX hw.cpufamily value for Apple M3 --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index e586f9a3c..8c5d04c14 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -270,6 +270,7 @@ int detect(void) sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 + if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 #endif return CPU_ARMV8; #endif From f1940010e4f219cb7859f2fd495dc27a3afa3425 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 22:51:26 +0100 Subject: [PATCH 411/718] Update version to 0.3.25 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 444baa114..6b65871f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 24) +set(OpenBLAS_PATCH_VERSION 25) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 0e54cbd18c09afb66254157100d59cdb5da92c00 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 22:52:05 +0100 Subject: [PATCH 412/718] Update version to 0.3.25 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 80060a0fc..fd44b1b12 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.24 +VERSION = 0.3.25 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From dff686a86c1a4b59ea4604251c9f8c6648047807 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 23:06:46 +0100 Subject: [PATCH 413/718] Update version to 0.3.25.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b65871f8..69077322a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 25) +set(OpenBLAS_PATCH_VERSION 25.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From f4cc1b7a6f976ddb985dcfe00f352b519ab209a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Nov 2023 23:07:19 +0100 Subject: [PATCH 414/718] Update version to 0.3.25.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index fd44b1b12..58f02358e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.25 +VERSION = 0.3.25.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From d99aad8ee308600832da39105a6511275cfe32ad Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 14 Nov 2023 11:07:08 -0600 Subject: [PATCH 415/718] Fix older version of gcc - missing __has_builtin, cpuid and no support of P10. --- Makefile.power | 12 ++++++++++++ driver/others/dynamic_power.c | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Makefile.power b/Makefile.power index 95bada34f..aa1ca080a 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,7 +11,19 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) +ifeq ($(C_COMPILER), GCC)) +ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +else ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math +else +$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math +endif +else +CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +endif ifeq ($(F_COMPILER), IBM) FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize else diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index f0faf2baf..0454f186c 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -66,8 +66,7 @@ static int cpuid(void) #endif return CPU_UNKNOWN; } -#else -#if defined(C_PGI) || defined(__clang__) +#elif defined(C_PGI) || defined(__clang__) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). * Fake a version here for use in the CPU detection code below. @@ -196,13 +195,21 @@ static int cpuid(void) cpu_type = pvrPOWER[i].cpu_type; return (int)(cpu_type); } -#endif /* C_PGI */ +#elif !defined(__BUILTIN_CPU_SUPPORTS__) +static int cpuid(void) +{ + return CPU_UNKNOWN; +} #endif /* _AIX */ #ifndef __BUILTIN_CPU_SUPPORTS__ #include -#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is)) +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if defined(_AIX) || !__has_builtin(__builtin_cpu_is) static int __builtin_cpu_is(const char *arg) { static int ipinfo = -1; @@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg) } #endif -#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports)) +#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports) static int __builtin_cpu_supports(const char *arg) { return 0; From 23cda457fb0306d0aba39c3e19a1ced3fa0116c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 09:48:23 +0100 Subject: [PATCH 416/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/SRC/cgeqp3rk.f | 1091 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/claqp2rk.f | 726 ++++++++++++++++++++++ lapack-netlib/SRC/claqp3rk.f | 947 +++++++++++++++++++++++++++++ lapack-netlib/SRC/dgeqp3rk.f | 1081 +++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dlaqp2rk.f | 713 ++++++++++++++++++++++ lapack-netlib/SRC/dlaqp3rk.f | 935 +++++++++++++++++++++++++++++ lapack-netlib/SRC/ilaenv.f | 21 +- lapack-netlib/SRC/sgeqp3rk.f | 1081 +++++++++++++++++++++++++++++++++ lapack-netlib/SRC/slaqp2rk.f | 713 ++++++++++++++++++++++ lapack-netlib/SRC/slaqp3rk.f | 935 +++++++++++++++++++++++++++++ lapack-netlib/SRC/zgeqp3rk.f | 1091 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zlaqp2rk.f | 726 ++++++++++++++++++++++ lapack-netlib/SRC/zlaqp3rk.f | 947 +++++++++++++++++++++++++++++ 13 files changed, 11006 insertions(+), 1 deletion(-) create mode 100644 lapack-netlib/SRC/cgeqp3rk.f create mode 100644 lapack-netlib/SRC/claqp2rk.f create mode 100644 lapack-netlib/SRC/claqp3rk.f create mode 100644 lapack-netlib/SRC/dgeqp3rk.f create mode 100644 lapack-netlib/SRC/dlaqp2rk.f create mode 100644 lapack-netlib/SRC/dlaqp3rk.f create mode 100644 lapack-netlib/SRC/sgeqp3rk.f create mode 100644 lapack-netlib/SRC/slaqp2rk.f create mode 100644 lapack-netlib/SRC/slaqp3rk.f create mode 100644 lapack-netlib/SRC/zgeqp3rk.f create mode 100644 lapack-netlib/SRC/zlaqp2rk.f create mode 100644 lapack-netlib/SRC/zlaqp3rk.f diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f new file mode 100644 index 000000000..70789e64f --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -0,0 +1,1091 @@ +*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL RWORK( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the orthogonal +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*. LWORK >= N+NRHS-1 +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for CGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine CLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> CGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in CGEQP3 routine which uses +*> CLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL RWORK( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL CLAQP2RK, CLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SCNRM2 + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in CLAQP2RK. +* 1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in CLARF subroutine inside CLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code +* in CLAQP3RK. +* 1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in CLARF subroutine to apply an elementary reflector +* from the left. +* 3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) CLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = CMPLX( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = CMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + RWORK( J ) = SCNRM2( M, A( 1, J ), 1 ) + RWORK( N+J ) = RWORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, RWORK( 1 ), 1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = CMPLX( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = CMPLX( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = CZERO + END DO + WORK( 1 ) = CMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = SLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = SLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = CMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) ) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), WORK( JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = CMPLX( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = CMPLX( LWKOPT ) +* + RETURN +* +* End of CGEQP3RK +* + END diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f new file mode 100644 index 000000000..073ad0f88 --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (N-1) +*> Used in CLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL VN1( * ), VN2( * ) + COMPLEX A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX AIKK +* .. +* .. External Subroutines .. + EXTERNAL CLARF, CLARFG, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SCNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SCNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( REAL( TAU(KK) ) ) ) THEN + TAUNAN = REAL( TAU(KK) ) + ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN + TAUNAN = IMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ CONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of CLAQP2RK +* + END diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f new file mode 100644 index 000000000..af5e85645 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine CGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine CGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is COMPLEX array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is COMPLEX array, dimension (LDF,NB) +*> Matrix F**H = L*(Y**H)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL VN1( * ), VN2( * ) + COMPLEX A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + REAL HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX AIK +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGEMV, CLARFG, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SCNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SCNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = CONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since CLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by CLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by CLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( REAL( TAU(K) ) ) ) THEN + TAUNAN = REAL( TAU(K) ) + ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN + TAUNAN = IMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( SISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SCNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of CLAQP3RK +* + END diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f new file mode 100644 index 000000000..ace97b712 --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -0,0 +1,1081 @@ +*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*. LWORK >= (3*N + NRHS - 1) +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for DGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine DLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> DGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in DGEQP3 routine which uses +*> DLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL DLAQP2RK, DLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in DLAQP2RK. +* 1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine inside DLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code +* in DLAQP3RK. +* 1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in DLARF subroutine to apply an elementary reflector +* from the left. +* 3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) DLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DBLE( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = DNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = DBLE( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DBLE( LWKOPT ) +* + RETURN +* +* End of DGEQP3RK +* + END diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f new file mode 100644 index 000000000..b5a84d0de --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (N-1) +*> Used in DLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + DOUBLE PRECISION AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DLARF, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since DLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of DLAQP2RK +* + END diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f new file mode 100644 index 000000000..39e617d0e --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.f @@ -0,0 +1,935 @@ +*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* +* .. Scalar Arguments .. +* LOGICAL DONE +* INTEGER KB, LDA, LDF, M, N, NB, NRHS, IOFFSET +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), +* $ VN1( * ), VN2( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine DGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine DGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is DOUBLE PRECISION array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is DOUBLE PRECISION array, dimension (LDF,NB) +*> Matrix F**T = L*(Y**T)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DLARFG, DSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since DLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by DLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by DLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL DGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of DLAQP3RK +* + END diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index a639e0375..e74a2b35e 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -132,7 +132,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup OTHERauxiliary +*> \ingroup ilaenv * *> \par Further Details: * ===================== @@ -355,6 +355,12 @@ ELSE NB = 64 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( C2.EQ.'PO' ) THEN IF( C3.EQ.'TRF' ) THEN @@ -541,7 +547,14 @@ ELSE NBMIN = 2 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NBMIN = 2 + ELSE + NBMIN = 2 + END IF END IF + ELSE IF( C2.EQ.'SY' ) THEN IF( C3.EQ.'TRF' ) THEN IF( SNAME ) THEN @@ -618,6 +631,12 @@ ELSE NX = 128 END IF + ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN + IF( SNAME ) THEN + NX = 128 + ELSE + NX = 128 + END IF END IF ELSE IF( C2.EQ.'SY' ) THEN IF( SNAME .AND. C3.EQ.'TRD' ) THEN diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f new file mode 100644 index 000000000..17559c7f4 --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -0,0 +1,1081 @@ +*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a real +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a real M-by-NRHS +*> matrix B with Q(K)**T * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**T * B, the matrix B with the orthogonal +*> transformation Q(K)**T applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**T, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a real scalar, +*> v is a real vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = SLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**T * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*. LWORK >= (3*N + NRHS - 1) +*> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), +*> where NB is the optimal block size for SGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine SLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> SGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in SGEQP3 routine which uses +*> SLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + REAL ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + REAL ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + REAL EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL SLAQP2RK, SLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX, ILAENV + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( SISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( SISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in SLAQP2RK. +* 1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial +* column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine inside SLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = 3*N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code +* in SLAQP3RK. +* 1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in SLARF subroutine to apply an elementary reflector +* from the left. +* 3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) SLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = REAL( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = REAL( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + WORK( J ) = SNRM2( M, A( 1, J ), 1 ) + WORK( N+J ) = WORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = ISAMAX( N, WORK( 1 ), 1 ) + MAXC2NRM = WORK( KP1 ) +* +* ==================================================================. +* + IF( SISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = REAL( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = REAL( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = SLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = ZERO + END DO + WORK( 1 ) = REAL( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = SLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = SLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = ZERO + END DO +* + WORK( 1 ) = REAL( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 )) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), WORK( 2*N+JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = REAL( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), WORK( J ), WORK( N+J ), + $ WORK( 2*N+1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 ) + MAXC2NRMK = WORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = ZERO + END DO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = REAL( LWKOPT ) +* + RETURN +* +* End of SGEQP3RK +* + END diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f new file mode 100644 index 000000000..d3dbb3d7c --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.f @@ -0,0 +1,713 @@ +*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), +* $ WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of a real matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (N-1) +*> Used in SLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + REAL A( LDA, * ), TAU( * ), VN1( * ), VN2( * ), + $ WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + REAL AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SLARF, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = ZERO. +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = ZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since SLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(KK) for NaN. +* + IF( SISNAN( TAU(KK) ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( KK ) + RELMAXC2NRMK = TAU( KK ) +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = ONE + CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = ZERO + END DO +* + RETURN +* +* End of SLAQP2RK +* + END diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f new file mode 100644 index 000000000..fa735bb9d --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.f @@ -0,0 +1,935 @@ +*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* +* .. Scalar Arguments .. +* LOGICAL DONE +* INTEGER KB, LDA, LDF, M, N, NB, NRHS, IOFFSET +* REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* REAL A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), +* $ VN1( * ), VN2( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is REAL, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is REAL, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine SGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is REAL +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine SGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**T. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is REAL +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is REAL +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is REAL array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is REAL array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is REAL array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is REAL array, dimension (LDF,NB) +*> Matrix F**T = L*(Y**T)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + REAL ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + REAL A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ), + $ VN1( * ), VN2( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + REAL AIK, HUGEVAL, TEMP, TEMP2, TOL3Z +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGEMV, SLARFG, SSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL SISNAN + INTEGER ISAMAX + REAL SLAMCH, SNRM2 + EXTERNAL SISNAN, SLAMCH, ISAMAX, SNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( SLAMCH( 'Epsilon' ) ) + HUGEVAL = SLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( SISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. +* + DO J = K, MINMNFACT + TAU( J ) = ZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 ) + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = ZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since SLARFG cannot produce TAU(K) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by SLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by SLARFG is covered by checking TAU(K) for NaN. +* + IF( SISNAN( TAU(K) ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAU( K ) + RELMAXC2NRMK = TAU( K ) +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = ONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ ZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = ZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, ZERO, + $ AUXV( 1 ), 1 ) +* + CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. +* + IF( K.LT.N+NRHS ) THEN + CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE, + $ F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE, + $ A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL SGEMM( 'No transpose', 'Transpose', + $ M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* SNRM2 does not fail on vectors with norm below the value of +* SQRT(SLAMCH('S')) +* + VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of SLAQP3RK +* + END diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f new file mode 100644 index 000000000..f8ef986c7 --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -0,0 +1,1091 @@ +*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGEQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, +* $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ WORK, LWORK, RWORK, IWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, K, KMAX, LDA, LWORK, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION RWORK( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGEQP3RK performs two tasks simultaneously: +*> +*> Task 1: The routine computes a truncated (rank K) or full rank +*> Householder QR factorization with column pivoting of a complex +*> M-by-N matrix A using Level 3 BLAS. K is the number of columns +*> that were factorized, i.e. factorization rank of the +*> factor R, K <= min(M,N). +*> +*> A * P(K) = Q(K) * R(K) = +*> +*> = Q(K) * ( R11(K) R12(K) ) = Q(K) * ( R(K)_approx ) +*> ( 0 R22(K) ) ( 0 R(K)_residual ), +*> +*> where: +*> +*> P(K) is an N-by-N permutation matrix; +*> Q(K) is an M-by-M orthogonal matrix; +*> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the +*> full rank factor R with K-by-K upper-triangular +*> R11(K) and K-by-N rectangular R12(K). The diagonal +*> entries of R11(K) appear in non-increasing order +*> of absolute value, and absolute values of all of +*> them exceed the maximum column 2-norm of R22(K) +*> up to roundoff error. +*> R(K)_residual = R22(K) is the residual of a rank K approximation +*> of the full rank factor R. It is a +*> an (M-K)-by-(N-K) rectangular matrix; +*> 0 is a an (M-K)-by-K zero matrix. +*> +*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS +*> matrix B with Q(K)**H * B using Level 3 BLAS. +*> +*> ===================================================================== +*> +*> The matrices A and B are stored on input in the array A as +*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS) +*> respectively. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> The truncation criteria (i.e. when to stop the factorization) +*> can be any of the following: +*> +*> 1) The input parameter KMAX, the maximum number of columns +*> KMAX to factorize, i.e. the factorization rank is limited +*> to KMAX. If KMAX >= min(M,N), the criterion is not used. +*> +*> 2) The input parameter ABSTOL, the absolute tolerance for +*> the maximum column 2-norm of the residual matrix R22(K). This +*> means that the factorization stops if this norm is less or +*> equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used. +*> +*> 3) The input parameter RELTOL, the tolerance for the maximum +*> column 2-norm matrix of the residual matrix R22(K) divided +*> by the maximum column 2-norm of the original matrix A, which +*> is equal to abs(R(1,1)). This means that the factorization stops +*> when the ratio of the maximum column 2-norm of R22(K) to +*> the maximum column 2-norm of A is less than or equal to RELTOL. +*> If RELTOL < 0.0, the criterion is not used. +*> +*> 4) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix R22(K) is a zero matrix in some +*> factorization step K. ( This stopping criterion is implicit. ) +*> +*> The algorithm stops when any of these conditions is first +*> satisfied, otherwise the whole matrix A is factorized. +*> +*> To factorize the whole matrix A, use the values +*> KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0. +*> +*> The routine returns: +*> a) Q(K), R(K)_approx = ( R11(K), R12(K) ), +*> R(K)_residual = R22(K), P(K), i.e. the resulting matrices +*> of the factorization; P(K) is represented by JPIV, +*> ( if K = min(M,N), R(K)_approx is the full factor R, +*> and there is no residual matrix R(K)_residual); +*> b) K, the number of columns that were factorized, +*> i.e. factorization rank; +*> c) MAXC2NRMK, the maximum column 2-norm of the residual +*> matrix R(K)_residual = R22(K), +*> ( if K = min(M,N), MAXC2NRMK = 0.0 ); +*> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum +*> column 2-norm of the original matrix A, which is equal +*> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); +*> e) Q(K)**H * B, the matrix B with the orthogonal +*> transformation Q(K)**H applied on the left. +*> +*> The N-by-N permutation matrix P(K) is stored in a compact form in +*> the integer array JPIV. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The M-by-M orthogonal matrix Q is represented as a product +*> of elementary Householder reflectors +*> +*> Q(K) = H(1) * H(2) * . . . * H(K), +*> +*> where K is the number of columns that were factorized. +*> +*> Each H(j) has the form +*> +*> H(j) = I - tau * v * v**H, +*> +*> where 1 <= j <= K and +*> I is an M-by-M identity matrix, +*> tau is a complex scalar, +*> v is a complex vector with v(1:j-1) = 0 and v(j) = 1. +*> +*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j). +*> +*> See the Further Details section for more information. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e. the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M,N), then this stopping criterion +*> is not used, the routine factorizes columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B are not modified, and +*> the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION +*> +*> The second factorization stopping criterion, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix R22(K). +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix R22(K) +*> is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S'). +*> +*> a) If ABSTOL is NaN, then no computation is performed +*> and an error message ( INFO = -5 ) is issued +*> by XERBLA. +*> +*> b) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN +*> is used. This includes the case ABSTOL = -0.0. +*> +*> d) If 2*SAFMIN <= ABSTOL then the input value +*> of ABSTOL is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If ABSTOL chosen above is >= MAXC2NRM, then this +*> stopping criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. The routine +*> returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case ABSTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION +*> +*> The third factorization stopping criterion, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio +*> abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of +*> the residual matrix R22(K) to the maximum column 2-norm of +*> the original matrix A. The algorithm converges (stops the +*> factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less +*> than or equal to RELTOL. Let EPS = DLAMCH('E'). +*> +*> a) If RELTOL is NaN, then no computation is performed +*> and an error message ( INFO = -6 ) is issued +*> by XERBLA. +*> +*> b) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used. +*> This includes the case RELTOL = -0.0. +*> +*> d) If EPS <= RELTOL then the input value of RELTOL +*> is used. +*> +*> Let MAXC2NRM be the maximum column 2-norm of the +*> whole original matrix A. +*> If RELTOL chosen above is >= 1.0, then this stopping +*> criterion is satisfied on input and routine exits +*> immediately after MAXC2NRM is computed. +*> The routine returns MAXC2NRM in MAXC2NORMK, +*> and 1.0 in RELMAXC2NORMK. +*> This includes the case RELTOL = +Inf. This means that the +*> factorization is not performed, the matrices A and B are not +*> modified, and the matrix A is itself the residual. +*> +*> NOTE: We recommend that RELTOL satisfy +*> min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> +*> On entry: +*> +*> a) The subarray A(1:M,1:N) contains the M-by-N matrix A. +*> b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS +*> matrix B. +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> +*> a) The subarray A(1:M,1:N) contains parts of the factors +*> of the matrix A: +*> +*> 1) If K = 0, A(1:M,1:N) contains the original matrix A. +*> 2) If K > 0, A(1:M,1:N) contains parts of the +*> factors: +*> +*> 1. The elements below the diagonal of the subarray +*> A(1:M,1:K) together with TAU(1:K) represent the +*> orthogonal matrix Q(K) as a product of K Householder +*> elementary reflectors. +*> +*> 2. The elements on and above the diagonal of +*> the subarray A(1:K,1:N) contain K-by-N +*> upper-trapezoidal matrix +*> R(K)_approx = ( R11(K), R12(K) ). +*> NOTE: If K=min(M,N), i.e. full rank factorization, +*> then R_approx(K) is the full factor R which +*> is upper-trapezoidal. If, in addition, M>=N, +*> then R is upper-triangular. +*> +*> 3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K) +*> rectangular matrix R(K)_residual = R22(K). +*> +*> b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains +*> the M-by-NRHS product Q(K)**H * B. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> This is the leading dimension for both matrices, A and B. +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> +*> NOTE: If K = 0, a) the arrays A and B are not modified; +*> b) the array TAU(1:min(M,N)) is set to ZERO, +*> if the matrix A does not contain NaN, +*> otherwise the elements TAU(1:min(M,N)) +*> are undefined; +*> c) the elements of the array JPIV are set +*> as follows: for j = 1:N, JPIV(j) = j. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix R22(K), +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then MAXC2NRMK equals the maximum column 2-norm +*> of the original matrix A. +*> +*> b) If 0 < K < min(M,N), then MAXC2NRMK is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then MAXC2NRMK = 0.0. +*> +*> NOTE: MAXC2NRMK in the factorization step K would equal +*> R(K+1,K+1) in the next factorization step K+1. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix R22(K) (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> +*> a) If K = 0, i.e. the factorization was not performed, +*> the matrix A was not modified and is itself a residual +*> matrix, then RELMAXC2NRMK = 1.0. +*> +*> b) If 0 < K < min(M,N), then +*> RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned. +*> +*> c) If K = min(M,N), i.e. the whole matrix A was +*> factorized and there is no residual matrix, +*> then RELMAXC2NRMK = 0.0. +*> +*> NOTE: RELMAXC2NRMK in the factorization step K would equal +*> abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization +*> step K+1. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices. For 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> +*> The elements of the array JPIV(1:N) are always set +*> by the routine, for example, even when no columns +*> were factorized, i.e. when K = 0, the elements are +*> set as JPIV(j) = j for j = 1:N. +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M,N)) +*> The scalar factors of the elementary reflectors. +*> +*> If 0 < K <= min(M,N), only the elements TAU(1:K) of +*> the array TAU are modified by the factorization. +*> After the factorization computed, if no NaN was found +*> during the factorization, the remaining elements +*> TAU(K+1:min(M,N)) are set to zero, otherwise the +*> elements TAU(K+1:min(M,N)) are not set and therefore +*> undefined. +*> ( If K = 0, all elements of TAU are set to zero, if +*> the matrix A does not contain NaN. ) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*. LWORK >= N+NRHS-1 +*> For optimal performance LWORK >= NB*( N+NRHS+1 ), +*> where NB is the optimal block size for ZGEQP3RK returned +*> by ILAENV. Minimal block size MINNB=2. +*> +*> NOTE: The decision, whether to use unblocked BLAS 2 +*> or blocked BLAS 3 code is based not only on the dimension +*> LWORK of the availbale workspace WORK, but also also on the +*> matrix A dimension N via crossover point NX returned +*> by ILAENV. (For N less than NX, unblocked code should be +*> used.) +*> +*> If LWORK = -1, then a workspace query is assumed; +*> the routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (2*N) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix in the blocked step auxiliary subroutine ZLAQP3RK ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) INFO < 0: if INFO = -i, the i-th argument had an +*> illegal value. +*> 3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup geqp3rk +* +*> \par Further Details: +* ===================== +* +*> \verbatim +*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization +*> algorithm with column pivoting as in ZGEQP3 routine which uses +*> ZLARFG routine to generate Householder reflectors +*> for QR factorization. +*> +*> We can also write: +*> +*> A = A_approx(K) + A_residual(K) +*> +*> The low rank approximation matrix A(K)_approx from +*> the truncated QR factorization of rank K of the matrix A is: +*> +*> A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T +*> ( 0 0 ) +*> +*> = Q(K) * ( R11(K) R12(K) ) * P(K)**T +*> ( 0 0 ) +*> +*> The residual A_residual(K) of the matrix A is: +*> +*> A_residual(K) = Q(K) * ( 0 0 ) * P(K)**T = +*> ( 0 R(K)_residual ) +*> +*> = Q(K) * ( 0 0 ) * P(K)**T +*> ( 0 R22(K) ) +*> +*> The truncated (rank K) factorization guarantees that +*> the maximum column 2-norm of A_residual(K) is less than +*> or equal to MAXC2NRMK up to roundoff error. +*> +*> NOTE: An approximation of the null vectors +*> of A can be easily computed from R11(K) +*> and R12(K): +*> +*> Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) ) +*> ( -I ) +*> +*> \endverbatim +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA, + $ K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ WORK, LWORK, RWORK, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION RWORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER INB, INBMIN, IXOVER + PARAMETER ( INB = 1, INBMIN = 2, IXOVER = 3 ) + DOUBLE PRECISION ZERO, ONE, TWO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, DONE + INTEGER IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX, + $ JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB, + $ NBMIN, NX + DOUBLE PRECISION EPS, HUGEVAL, MAXC2NRM, SAFMIN +* .. +* .. External Subroutines .. + EXTERNAL ZLAQP2RK, ZLAQP3RK, XERBLA +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX, ILAENV + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test input arguments +* ==================== +* + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( KMAX.LT.0 ) THEN + INFO = -4 + ELSE IF( DISNAN( ABSTOL ) ) THEN + INFO = -5 + ELSE IF( DISNAN( RELTOL ) ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -8 + END IF +* +* If the input parameters M, N, NRHS, KMAX, LDA are valid: +* a) Test the input workspace size LWORK for the minimum +* size requirement IWS. +* b) Determine the optimal block size NB and optimal +* workspace size LWKOPT to be returned in WORK(1) +* in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., +* (3) when routine exits. +* Here, IWS is the miminum workspace required for unblocked +* code. +* + IF( INFO.EQ.0 ) THEN + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + IWS = 1 + LWKOPT = 1 + ELSE +* +* Minimal workspace size in case of using only unblocked +* BLAS 2 code in ZLAQP2RK. +* 1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in ZLARF subroutine inside ZLAQP2RK to apply an +* elementary reflector from the left. +* TOTAL_WORK_SIZE = 3*N + NRHS - 1 +* + IWS = N + NRHS - 1 +* +* Assign to NB optimal block size. +* + NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 ) +* +* A formula for the optimal workspace size in case of using +* both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code +* in ZLAQP3RK. +* 1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and +* partial column 2-norms. +* 2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used +* in ZLARF subroutine to apply an elementary reflector +* from the left. +* 3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that +* is used to apply a block reflector from +* the left. +* 4) ZLAQP3RK: NB to use in the auxilixary array AUX. +* Sizes (2) and ((3) + (4)) should intersect, therefore +* TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. +* + LWKOPT = 2*N + NB*( N+NRHS+1 ) + END IF + WORK( 1 ) = DCMPLX( LWKOPT ) +* + IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN + INFO = -15 + END IF + END IF +* +* NOTE: The optimal workspace size is returned in WORK(1), if +* the input parameters M, N, NRHS, KMAX, LDA are valid. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGEQP3RK', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible for M=0 or N=0. +* + IF( MINMN.EQ.0 ) THEN + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* +* Initialize column pivot array JPIV. +* + DO J = 1, N + JPIV( J ) = J + END DO +* +* ================================================================== +* +* Initialize storage for partial and exact column 2-norms. +* a) The elements WORK(1:N) are used to store partial column +* 2-norms of the matrix A, and may decrease in each computation +* step; initialize to the values of complete columns 2-norms. +* b) The elements WORK(N+1:2*N) are used to store complete column +* 2-norms of the matrix A, they are not changed during the +* computation; initialize the values of complete columns 2-norms. +* + DO J = 1, N + RWORK( J ) = DZNRM2( M, A( 1, J ), 1 ) + RWORK( N+J ) = RWORK( J ) + END DO +* +* ================================================================== +* +* Compute the pivot column index and the maximum column 2-norm +* for the whole original matrix stored in A(1:M,1:N). +* + KP1 = IDAMAX( N, RWORK( 1 ), 1 ) +* +* ==================================================================. +* + IF( DISNAN( MAXC2NRM ) ) THEN +* +* Check if the matrix A contains NaN, set INFO parameter +* to the column number where the first NaN is found and return +* from the routine. +* + K = 0 + INFO = KP1 +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = MAXC2NRM +* +* Array TAU is not set and contains undefined elements. +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* =================================================================== +* + IF( MAXC2NRM.EQ.ZERO ) THEN +* +* Check is the matrix A is a zero matrix, set array TAU and +* return from the routine. +* + K = 0 + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN +* + END IF +* +* =================================================================== +* + HUGEVAL = DLAMCH( 'Overflow' ) +* + IF( MAXC2NRM.GT.HUGEVAL ) THEN +* +* Check if the matrix A contains +Inf or -Inf, set INFO parameter +* to the column number, where the first +/-Inf is found plus N, +* and continue the computation. +* + INFO = N + KP1 +* + END IF +* +* ================================================================== +* +* Quick return if possible for the case when the first +* stopping criterion is satisfied, i.e. KMAX = 0. +* + IF( KMAX.EQ.0 ) THEN + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE + DO J = 1, MINMN + TAU( J ) = CZERO + END DO + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* + EPS = DLAMCH('Epsilon') +* +* Adjust ABSTOL +* + IF( ABSTOL.GE.ZERO ) THEN + SAFMIN = DLAMCH('Safe minimum') + ABSTOL = MAX( ABSTOL, TWO*SAFMIN ) + END IF +* +* Adjust RELTOL +* + IF( RELTOL.GE.ZERO ) THEN + RELTOL = MAX( RELTOL, EPS ) + END IF +* +* =================================================================== +* +* JMAX is the maximum index of the column to be factorized, +* which is also limited by the first stopping criterion KMAX. +* + JMAX = MIN( KMAX, MINMN ) +* +* =================================================================== +* +* Quick return if possible for the case when the second or third +* stopping criterion for the whole original matrix is satified, +* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL +* (which is ONE <= RELTOL). +* + IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN +* + K = 0 + MAXC2NRMK = MAXC2NRM + RELMAXC2NRMK = ONE +* + DO J = 1, MINMN + TAU( J ) = CZERO + END DO +* + WORK( 1 ) = DCMPLX( LWKOPT ) + RETURN + END IF +* +* ================================================================== +* Factorize columns +* ================================================================== +* +* Determine the block size. +* + NBMIN = 2 + NX = 0 +* + IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN +* +* Determine when to cross over from blocked to unblocked code. +* (for N less than NX, unblocked code should be used). +* + NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) ) +* + IF( NX.LT.MINMN ) THEN +* +* Determine if workspace is large enough for blocked code. +* + IF( LWORK.LT.LWKOPT ) THEN +* +* Not enough workspace to use optimal block size that +* is currently stored in NB. +* Reduce NB and determine the minimum value of NB. +* + NB = ( LWORK-2*N ) / ( N+1 ) + NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N, + $ -1, -1 ) ) +* + END IF + END IF + END IF +* +* ================================================================== +* +* DONE is the boolean flag to rerpresent the case when the +* factorization completed in the block factorization routine, +* before the end of the block. +* + DONE = .FALSE. +* +* J is the column index. +* + J = 1 +* +* (1) Use blocked code initially. +* +* JMAXB is the maximum column index of the block, when the +* blocked code is used, is also limited by the first stopping +* criterion KMAX. +* + JMAXB = MIN( KMAX, MINMN - NX ) +* + IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN +* +* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: +* J is the column index of a column block; +* JB is the column block size to pass to block factorization +* routine in a loop step; +* JBF is the number of columns that were actually factorized +* that was returned by the block factorization routine +* in a loop step, JBF <= JB; +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + DO WHILE( J.LE.JMAXB ) +* + JB = MIN( NB, JMAXB-J+1 ) + N_SUB = N-J+1 + IOFFSET = J-1 +* +* Factorize JB columns among the columns A(J:N). +* + CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ DONE, JBF, MAXC2NRMK, RELMAXC2NRMK, + $ JPIV( J ), TAU( J ), + $ RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), WORK( JB+1 ), + $ N+NRHS-J+1, IWORK, IINFO ) +* +* Set INFO on the first occurence of Inf. +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + END IF +* + IF( DONE ) THEN +* +* Either the submatrix is zero before the end of the +* column block, or ABSTOL or RELTOL criterion is +* satisfied before the end of the column block, we can +* return from the routine. Perform the following before +* returning: +* a) Set the number of factorized columns K, +* K = IOFFSET + JBF from the last call of blocked +* routine. +* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned +* by the block factorization routine; +* 2) The remaining TAUs are set to ZERO by the +* block factorization routine. +* + K = IOFFSET + JBF +* +* Set INFO on the first occurrence of NaN, NaN takes +* prcedence over Inf. +* + IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* +* Return from the routine. +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* + END IF +* + J = J + JBF +* + END DO +* + END IF +* +* Use unblocked code to factor the last or only block. +* J = JMAX+1 means we factorized the maximum possible number of +* columns, that is in ELSE clause we need to compute +* the MAXC2NORM and RELMAXC2NORM to return after we processed +* the blocks. +* + IF( J.LE.JMAX ) THEN +* +* N_SUB is the number of columns in the submatrix; +* IOFFSET is the number of rows that should not be factorized. +* + N_SUB = N-J+1 + IOFFSET = J-1 +* + CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1, + $ ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA, + $ KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ), + $ TAU( J ), RWORK( J ), RWORK( N+J ), + $ WORK( 1 ), IINFO ) +* +* ABSTOL or RELTOL criterion is satisfied when the number of +* the factorized columns KF is smaller then the number +* of columns JMAX-J+1 supplied to be factorized by the +* unblocked routine, we can return from +* the routine. Perform the following before returning: +* a) Set the number of factorized columns K, +* b) MAXC2NRMK and RELMAXC2NRMK are returned by the +* unblocked factorization routine above. +* + K = J - 1 + KF +* +* Set INFO on the first exception occurence. +* +* Set INFO on the first exception occurence of Inf or NaN, +* (NaN takes precedence over Inf). +* + IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN + INFO = 2*IOFFSET + IINFO + ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN + INFO = IOFFSET + IINFO + END IF +* + ELSE +* +* Compute the return values for blocked code. +* +* Set the number of factorized columns if the unblocked routine +* was not called. +* + K = JMAX +* +* If there exits a residual matrix after the blocked code: +* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the +* residual matrix, otherwise set them to ZERO; +* 2) Set TAU(K+1:MINMN) to ZERO. +* + IF( K.LT.MINMN ) THEN + JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 ) + MAXC2NRMK = RWORK( JMAXC2NRM ) + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + DO J = K + 1, MINMN + TAU( J ) = CZERO + END DO +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO +* + END IF +* +* END IF( J.LE.JMAX ) THEN +* + END IF +* + WORK( 1 ) = DCMPLX( LWKOPT ) +* + RETURN +* +* End of ZGEQP3RK +* + END diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f new file mode 100644 index 000000000..f1e9f4899 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.f @@ -0,0 +1,726 @@ +*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP2RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, +* $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, +* $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, +* $ INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* $ +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR +*> factorization with column pivoting of the complex matrix +*> block A(IOFFSET+1:M,1:N) as +*> +*> A * P(K) = Q(K) * R(K). +*> +*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N) +*> is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides matrix block B +*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] KMAX +*> \verbatim +*> KMAX is INTEGER +*> +*> The first factorization stopping criterion. KMAX >= 0. +*> +*> The maximum number of columns of the matrix A to factorize, +*> i.e. the maximum factorization rank. +*> +*> a) If KMAX >= min(M-IOFFSET,N), then this stopping +*> criterion is not used, factorize columns +*> depending on ABSTOL and RELTOL. +*> +*> b) If KMAX = 0, then this stopping criterion is +*> satisfied on input and the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The second factorization stopping criterion. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The third factorization stopping criterion. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on KMAX and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine ZGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:K) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(K) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS). +*> The left part A(IOFFSET+1:M,K+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(K)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] K +*> \verbatim +*> K is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N). +*> +*> K also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank K. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank K) to the maximum column 2-norm of the +*> whole original matrix A. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (N-1) +*> Used in ZLARF subroutine to apply an elementary +*> reflector from the left. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step K+1 ( when K columns +*> have been factorized ). +*> +*> On exit: +*> K is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(K+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=K+1, TAU(K+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the factorization +*> step K+1 ( when K columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp2rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, + $ KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK, + $ RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK, + $ INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT, + $ MINMNUPDT + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIKK +* .. +* .. External Subroutines .. + EXTERNAL ZLARF, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* +* MINMNUPDT is the smallest dimension +* of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which +* contains the submatrices A(IOFFSET+1:M,1:N) and +* B(IOFFSET+1:M,1:NRHS) as column blocks. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + KMAX = MIN( KMAX, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute the factorization, KK is the lomn loop index. +* + DO KK = 1, KMAX +* + I = IOFFSET + KK +* + IF( I.EQ.1 ) THEN +* +* ============================================================ +* +* We are at the first column of the original whole matrix A, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* +* ============================================================ +* + ELSE +* +* ============================================================ +* +* Determine the pivot column in KK-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,KK:N) in step KK. +* RELMAXC2NRMK will be computed later, after somecondition +* checks on MAXC2NRMK. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains NaN, and set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + INFO = K + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* Array TAU(K+1:MINMNFACT) is not set and contains +* undefined elements. +* + RETURN + END IF +* +* ============================================================ +* +* Quick return, if the submatrix A(I:M,KK:N) is +* a zero matrix. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* +* Set K, the number of factorized columns. +* that are not zero. +* + K = KK - 1 + RELMAXC2NRMK = ZERO +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,KK:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + KK - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third stopping criteria. +* NOTE: There is no need to test for ABSTOL >= ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL >= ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. + + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* +* Set K, the number of factorized columns. +* + K = KK - 1 +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. +* + DO J = KK, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,KK:N): +* 1) swap the KK-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) copy the KK-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than KK in the next loop step.) +* 3) Save the pivot interchange with the indices relative to the +* the original matrix A, not the block A(1:M,1:N). +* + IF( KP.NE.KK ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 ) + VN1( KP ) = VN1( KK ) + VN2( KP ) = VN2( KK ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( KK ) + JPIV( KK ) = ITEMP + END IF +* +* Generate elementary reflector H(KK) using the column A(I:M,KK), +* if the column has more than one element, otherwise +* the elementary reflector would be an identity matrix, +* and TAU(KK) = CZERO. +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1, + $ TAU( KK ) ) + ELSE + TAU( KK ) = CZERO + END IF +* +* Check if TAU(KK) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(KK) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(KK) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(KK) for NaN. +* + IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN + TAUNAN = DBLE( TAU(KK) ) + ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN + TAUNAN = DIMAG( TAU(KK) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN + K = KK - 1 + INFO = KK +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* Array TAU(KK:MINMNFACT) is not set and contains +* undefined elements, except the first element TAU(KK) = NaN. +* + RETURN + END IF +* +* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. +* ( If M >= N, then at KK = N there is no residual matrix, +* i.e. no columns of A to update, only columns of B. +* If M < N, then at KK = M-IOFFSET, I = M and we have a +* one-row residual matrix in A and the elementary +* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update +* is needed for the residual matrix in A and the +* right-hand-side-matrix in B. +* Therefore, we update only if +* KK < MINMNUPDT = min(M-IOFFSET, N+NRHS) +* condition is satisfied, not only KK < N+NRHS ) +* + IF( KK.LT.MINMNUPDT ) THEN + AIKK = A( I, KK ) + A( I, KK ) = CONE + CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1, + $ DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA, + $ WORK( 1 ) ) + A( I, KK ) = AIKK + END IF +* + IF( KK.LT.MINMNFACT ) THEN +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. +* when KK < min(M-IOFFSET, N). +* + DO J = KK + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2 + TEMP = MAX( TEMP, ZERO ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2 .LE. TOL3Z ) THEN +* +* Compute the column 2-norm for the partial +* column A(I+1:M,J) by explicitly computing it, +* and store it in both partial 2-norm vector VN1 +* and exact column 2-norm vector VN2. +* + VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 ) + VN2( J ) = VN1( J ) +* + ELSE +* +* Update the column 2-norm for the partial +* column A(I+1:M,J) by removing one +* element A(I,J) and store it in partial +* 2-norm vector VN1. +* + VN1( J ) = VN1( J )*SQRT( TEMP ) +* + END IF + END IF + END DO +* + END IF +* +* End factorization loop +* + END DO +* +* If we reached this point, all colunms have been factorized, +* i.e. no condition was triggered to exit the routine. +* Set the number of factorized columns. +* + K = KMAX +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before +* we return. +* + IF( K.LT.MINMNFACT ) THEN +* + JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 ) + MAXC2NRMK = VN1( JMAXC2NRM ) +* + IF( K.EQ.0 ) THEN + RELMAXC2NRMK = ONE + ELSE + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM + END IF +* + ELSE + MAXC2NRMK = ZERO + RELMAXC2NRMK = ZERO + END IF +* +* We reached the end of the loop, i.e. all KMAX columns were +* factorized, set TAUs corresponding to the columns that were +* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. +* + DO J = K + 1, MINMNFACT + TAU( J ) = CZERO + END DO +* + RETURN +* +* End of ZLAQP2RK +* + END diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f new file mode 100644 index 000000000..7a9fdfd95 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.f @@ -0,0 +1,947 @@ +*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLAQP3RK + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, +* $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, +* $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, +* $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) +* IMPLICIT NONE +* LOGICAL DONE +* INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, +* $ NB, NRHS +* DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, +* $ RELTOL +* .. +* .. Array Arguments .. +* INTEGER IWORK( * ), JPIV( * ) +* DOUBLE PRECISION VN1( * ), VN2( * ) +* COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLAQP3RK computes a step of truncated QR factorization with column +*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N) +*> by using Level 3 BLAS as +*> +*> A * P(KB) = Q(KB) * R(KB). +*> +*> The routine tries to factorize NB columns from A starting from +*> the row IOFFSET+1 and updates the residual matrix with BLAS 3 +*> xGEMM. The number of actually factorized columns is returned +*> is smaller than NB. +*> +*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized. +*> +*> The routine also overwrites the right-hand-sides B matrix stored +*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B. +*> +*> Cases when the number of factorized columns KB < NB: +*> +*> (1) In some cases, due to catastrophic cancellations, it cannot +*> factorize all NB columns and need to update the residual matrix. +*> Hence, the actual number of factorized columns in the block returned +*> in KB is smaller than NB. The logical DONE is returned as FALSE. +*> The factorization of the whole original matrix A_orig must proceed +*> with the next block. +*> +*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. +*> +*> (3) In case both stopping criteria ABSTOL or RELTOL are not used, +*> and when the residual matrix is a zero matrix in some factorization +*> step KB, the factorization of the whole original matrix A_orig is +*> stopped, the logical DONE is returned as TRUE. The number of +*> factorized columns which is smaller than NB is returned in KB. +*> +*> (4) Whenever NaN is detected in the matrix A or in the array TAU, +*> the factorization of the whole original matrix A_orig is stopped, +*> the logical DONE is returned as TRUE. The number of factorized +*> columns which is smaller than NB is returned in KB. The INFO +*> parameter is set to the column index of the first NaN occurrence. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0 +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrix B. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] IOFFSET +*> \verbatim +*> IOFFSET is INTEGER +*> The number of rows of the matrix A that must be pivoted +*> but not factorized. IOFFSET >= 0. +*> +*> IOFFSET also represents the number of columns of the whole +*> original matrix A_orig that have been factorized +*> in the previous steps. +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> Factorization block size, i.e the number of columns +*> to factorize in the matrix A. 0 <= NB +*> +*> If NB = 0, then the routine exits immediately. +*> This means that the factorization is not performed, +*> the matrices A and B and the arrays TAU, IPIV +*> are not modified. +*> \endverbatim +*> +*> \param[in] ABSTOL +*> \verbatim +*> ABSTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The absolute tolerance (stopping threshold) for +*> maximum column 2-norm of the residual matrix. +*> The algorithm converges (stops the factorization) when +*> the maximum column 2-norm of the residual matrix +*> is less than or equal to ABSTOL. +*> +*> a) If ABSTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and RELTOL. +*> This includes the case ABSTOL = -Inf. +*> +*> b) If 0.0 <= ABSTOL then the input value +*> of ABSTOL is used. +*> \endverbatim +*> +*> \param[in] RELTOL +*> \verbatim +*> RELTOL is DOUBLE PRECISION, cannot be NaN. +*> +*> The tolerance (stopping threshold) for the ratio of the +*> maximum column 2-norm of the residual matrix to the maximum +*> column 2-norm of the original matrix A_orig. The algorithm +*> converges (stops the factorization), when this ratio is +*> less than or equal to RELTOL. +*> +*> a) If RELTOL < 0.0, then this stopping criterion is not +*> used, the routine factorizes columns depending +*> on NB and ABSTOL. +*> This includes the case RELTOL = -Inf. +*> +*> d) If 0.0 <= RELTOL then the input value of RELTOL +*> is used. +*> \endverbatim +*> +*> \param[in] KP1 +*> \verbatim +*> KP1 is INTEGER +*> The index of the column with the maximum 2-norm in +*> the whole original matrix A_orig determined in the +*> main routine ZGEQP3RK. 1 <= KP1 <= N_orig. +*> \endverbatim +*> +*> \param[in] MAXC2NRM +*> \verbatim +*> MAXC2NRM is DOUBLE PRECISION +*> The maximum column 2-norm of the whole original +*> matrix A_orig computed in the main routine ZGEQP3RK. +*> MAXC2NRM >= 0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N+NRHS) +*> On entry: +*> the M-by-N matrix A and M-by-NRHS matrix B, as in +*> +*> N NRHS +*> array_A = M [ mat_A, mat_B ] +*> +*> On exit: +*> 1. The elements in block A(IOFFSET+1:M,1:KB) below +*> the diagonal together with the array TAU represent +*> the orthogonal matrix Q(KB) as a product of elementary +*> reflectors. +*> 2. The upper triangular block of the matrix A stored +*> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. +*> 3. The block of the matrix A stored in A(1:IOFFSET,1:N) +*> has been accordingly pivoted, but not factorized. +*> 4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS). +*> The left part A(IOFFSET+1:M,KB+1:N) of this block +*> contains the residual of the matrix A, and, +*> if NRHS > 0, the right part of the block +*> A(IOFFSET+1:M,N+1:N+NRHS) contains the block of +*> the right-hand-side matrix B. Both these blocks have been +*> updated by multiplication from the left by Q(KB)**H. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] +*> \verbatim +*> DONE is LOGICAL +*> TRUE: a) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to ABSTOL +*> or RELTOL criterion, +*> b) if the factorization completed before processing +*> all min(M-IOFFSET,NB,N) columns due to the +*> residual matrix being a ZERO matrix. +*> c) when NaN was detected in the matrix A +*> or in the array TAU. +*> FALSE: otherwise. +*> \endverbatim +*> +*> \param[out] KB +*> \verbatim +*> KB is INTEGER +*> Factorization rank of the matrix A, i.e. the rank of +*> the factor R, which is the same as the number of non-zero +*> rows of the factor R. 0 <= KB <= min(M-IOFFSET,NB,N). +*> +*> KB also represents the number of non-zero Householder +*> vectors. +*> \endverbatim +*> +*> \param[out] MAXC2NRMK +*> \verbatim +*> MAXC2NRMK is DOUBLE PRECISION +*> The maximum column 2-norm of the residual matrix, +*> when the factorization stopped at rank KB. MAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] RELMAXC2NRMK +*> \verbatim +*> RELMAXC2NRMK is DOUBLE PRECISION +*> The ratio MAXC2NRMK / MAXC2NRM of the maximum column +*> 2-norm of the residual matrix (when the factorization +*> stopped at rank KB) to the maximum column 2-norm of the +*> original matrix A_orig. RELMAXC2NRMK >= 0. +*> \endverbatim +*> +*> \param[out] JPIV +*> \verbatim +*> JPIV is INTEGER array, dimension (N) +*> Column pivot indices, for 1 <= j <= N, column j +*> of the matrix A was interchanged with column JPIV(j). +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N)) +*> The scalar factors of the elementary reflectors. +*> \endverbatim +*> +*> \param[in,out] VN1 +*> \verbatim +*> VN1 is DOUBLE PRECISION array, dimension (N) +*> The vector with the partial column norms. +*> \endverbatim +*> +*> \param[in,out] VN2 +*> \verbatim +*> VN2 is DOUBLE PRECISION array, dimension (N) +*> The vector with the exact column norms. +*> \endverbatim +*> +*> \param[out] AUXV +*> \verbatim +*> AUXV is COMPLEX*16 array, dimension (NB) +*> Auxiliary vector. +*> \endverbatim +*> +*> \param[out] F +*> \verbatim +*> F is COMPLEX*16 array, dimension (LDF,NB) +*> Matrix F**H = L*(Y**H)*A. +*> \endverbatim +*> +*> \param[in] LDF +*> \verbatim +*> LDF is INTEGER +*> The leading dimension of the array F. LDF >= max(1,N+NRHS). +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (N-1). +*> Is a work array. ( IWORK is used to store indices +*> of "bad" columns for norm downdating in the residual +*> matrix ). +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> 1) INFO = 0: successful exit. +*> 2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was +*> detected and the routine stops the computation. +*> The j_1-th column of the matrix A or the j_1-th +*> element of array TAU contains the first occurrence +*> of NaN in the factorization step KB+1 ( when KB columns +*> have been factorized ). +*> +*> On exit: +*> KB is set to the number of +*> factorized columns without +*> exception. +*> MAXC2NRMK is set to NaN. +*> RELMAXC2NRMK is set to NaN. +*> TAU(KB+1:min(M,N)) is not set and contains undefined +*> elements. If j_1=KB+1, TAU(KB+1) +*> may contain NaN. +*> 3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN +*> was detected, but +Inf (or -Inf) was detected and +*> the routine continues the computation until completion. +*> The (j_2-N)-th column of the matrix A contains the first +*> occurrence of +Inf (or -Inf) in the actorization +*> step KB+1 ( when KB columns have been factorized ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup laqp3rk +* +*> \par References: +* ================ +*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996. +*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain. +*> X. Sun, Computer Science Dept., Duke University, USA. +*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA. +*> A BLAS-3 version of the QR factorization with column pivoting. +*> LAPACK Working Note 114 +*> \htmlonly +*> https://www.netlib.org/lapack/lawnspdf/lawn114.pdf +*> \endhtmlonly +*> and in +*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998. +*> \htmlonly +*> https://doi.org/10.1137/S1064827595296732 +*> \endhtmlonly +*> +*> [2] A partial column norm updating strategy developed in 2006. +*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia. +*> On the failure of rank revealing QR factorization software – a case study. +*> LAPACK Working Note 176. +*> \htmlonly +*> http://www.netlib.org/lapack/lawnspdf/lawn176.pdf +*> \endhtmlonly +*> and in +*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages. +*> \htmlonly +*> https://doi.org/10.1145/1377612.1377616 +*> \endhtmlonly +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2023, Igor Kozachenko, James Demmel, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, + $ RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB, + $ MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU, + $ VN1, VN2, AUXV, F, LDF, IWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + LOGICAL DONE + INTEGER INFO, IOFFSET, KB, KP1, LDA, LDF, M, N, + $ NB, NRHS + DOUBLE PRECISION ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK, + $ RELTOL +* .. +* .. Array Arguments .. + INTEGER IWORK( * ), JPIV( * ) + DOUBLE PRECISION VN1( * ), VN2( * ) + COMPLEX*16 A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER ITEMP, J, K, MINMNFACT, MINMNUPDT, + $ LSTICC, KP, I, IF + DOUBLE PRECISION HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z + COMPLEX*16 AIK +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZGEMV, ZLARFG, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT +* .. +* .. External Functions .. + LOGICAL DISNAN + INTEGER IDAMAX + DOUBLE PRECISION DLAMCH, DZNRM2 + EXTERNAL DISNAN, DLAMCH, IDAMAX, DZNRM2 +* .. +* .. Executable Statements .. +* +* Initialize INFO +* + INFO = 0 +* +* MINMNFACT in the smallest dimension of the submatrix +* A(IOFFSET+1:M,1:N) to be factorized. +* + MINMNFACT = MIN( M-IOFFSET, N ) + MINMNUPDT = MIN( M-IOFFSET, N+NRHS ) + NB = MIN( NB, MINMNFACT ) + TOL3Z = SQRT( DLAMCH( 'Epsilon' ) ) + HUGEVAL = DLAMCH( 'Overflow' ) +* +* Compute factorization in a while loop over NB columns, +* K is the column index in the block A(1:M,1:N). +* + K = 0 + LSTICC = 0 + DONE = .FALSE. +* + DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 ) + K = K + 1 + I = IOFFSET + K +* + IF( I.EQ.1 ) THEN +* +* We are at the first column of the original whole matrix A_orig, +* therefore we use the computed KP1 and MAXC2NRM from the +* main routine. +* + KP = KP1 +* + ELSE +* +* Determine the pivot column in K-th step, i.e. the index +* of the column with the maximum 2-norm in the +* submatrix A(I:M,K:N). +* + KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 ) +* +* Determine the maximum column 2-norm and the relative maximum +* column 2-norm of the submatrix A(I:M,K:N) in step K. +* + MAXC2NRMK = VN1( KP ) +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains NaN, set +* INFO parameter to the column number, where the first NaN +* is found and return from the routine. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( DISNAN( MAXC2NRMK ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = KB + KP +* +* Set RELMAXC2NRMK to NaN. +* + RELMAXC2NRMK = MAXC2NRMK +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. + + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* Quick return, if the submatrix A(I:M,K:N) is +* a zero matrix. We need to check it only if the column index +* (same as row index) is larger than 1, since the condition +* for the whole original matrix A_orig is checked in the main +* routine. +* + IF( MAXC2NRMK.EQ.ZERO ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + RELMAXC2NRMK = ZERO +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix is zero and we stop the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* Check if the submatrix A(I:M,K:N) contains Inf, +* set INFO parameter to the column number, where +* the first Inf is found plus N, and continue +* the computation. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN + INFO = N + K - 1 + KP + END IF +* +* ============================================================ +* +* Test for the second and third tolerance stopping criteria. +* NOTE: There is no need to test for ABSTOL.GE.ZERO, since +* MAXC2NRMK is non-negative. Similarly, there is no need +* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is +* non-negative. +* We need to check the condition only if the +* column index (same as row index) of the original whole +* matrix is larger than 1, since the condition for whole +* original matrix is checked in the main routine. +* + RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM +* + IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig; +* + KB = K - 1 + IF = I - 1 +* +* Apply the block reflector to the residual of the +* matrix A and the residual of the right hand sides B, if +* the residual matrix and and/or the residual of the right +* hand sides exist, i.e. if the submatrix +* A(I+1:M,KB+1:N+NRHS) exists. This occurs when +* KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Set TAUs corresponding to the columns that were not +* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, +* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. +* + DO J = K, MINMNFACT + TAU( J ) = CZERO + END DO +* +* Return from the routine. +* + RETURN +* + END IF +* +* ============================================================ +* +* End ELSE of IF(I.EQ.1) +* + END IF +* +* =============================================================== +* +* If the pivot column is not the first column of the +* subblock A(1:M,K:N): +* 1) swap the K-th column and the KP-th pivot column +* in A(1:M,1:N); +* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) +* 3) copy the K-th element into the KP-th element of the partial +* and exact 2-norm vectors VN1 and VN2. (Swap is not needed +* for VN1 and VN2 since we use the element with the index +* larger than K in the next loop step.) +* 4) Save the pivot interchange with the indices relative to the +* the original matrix A_orig, not the block A(1:M,1:N). +* + IF( KP.NE.K ) THEN + CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 ) + CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF ) + VN1( KP ) = VN1( K ) + VN2( KP ) = VN2( K ) + ITEMP = JPIV( KP ) + JPIV( KP ) = JPIV( K ) + JPIV( K ) = ITEMP + END IF +* +* Apply previous Householder reflectors to column K: +* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. +* + IF( K.GT.1 ) THEN + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ), + $ LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 ) + DO J = 1, K - 1 + F( K, J ) = DCONJG( F( K, J ) ) + END DO + END IF +* +* Generate elementary reflector H(k) using the column A(I:M,K). +* + IF( I.LT.M ) THEN + CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) ) + ELSE + TAU( K ) = CZERO + END IF +* +* Check if TAU(K) contains NaN, set INFO parameter +* to the column number where NaN is found and return from +* the routine. +* NOTE: There is no need to check TAU(K) for Inf, +* since ZLARFG cannot produce TAU(KK) or Householder vector +* below the diagonal containing Inf. Only BETA on the diagonal, +* returned by ZLARFG can contain Inf, which requires +* TAU(K) to contain NaN. Therefore, this case of generating Inf +* by ZLARFG is covered by checking TAU(K) for NaN. +* + IF( DISNAN( DBLE( TAU(K) ) ) ) THEN + TAUNAN = DBLE( TAU(K) ) + ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN + TAUNAN = DIMAG( TAU(K) ) + ELSE + TAUNAN = ZERO + END IF +* + IF( DISNAN( TAUNAN ) ) THEN +* + DONE = .TRUE. +* +* Set KB, the number of factorized partial columns +* that are non-zero in each step in the block, +* i.e. the rank of the factor R. +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig. +* + KB = K - 1 + IF = I - 1 + INFO = K +* +* Set MAXC2NRMK and RELMAXC2NRMK to NaN. +* + MAXC2NRMK = TAUNAN + RELMAXC2NRMK = TAUNAN +* +* There is no need to apply the block reflector to the +* residual of the matrix A stored in A(KB+1:M,KB+1:N), +* since the submatrix contains NaN and we stop +* the computation. +* But, we need to apply the block reflector to the residual +* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the +* residual right hand sides exist. This occurs +* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): +* +* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - +* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. +* + IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA ) + END IF +* +* There is no need to recompute the 2-norm of the +* difficult columns, since we stop the factorization. +* +* Array TAU(KF+1:MINMNFACT) is not set and contains +* undefined elements. +* +* Return from the routine. +* + RETURN + END IF +* +* =============================================================== +* + AIK = A( I, K ) + A( I, K ) = CONE +* +* =============================================================== +* +* Compute the current K-th column of F: +* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K, + $ TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1, + $ CZERO, F( K+1, K ), 1 ) + END IF +* +* 2) Zero out elements above and on the diagonal of the +* column K in matrix F, i.e elements F(1:K,K). +* + DO J = 1, K + F( J, K ) = CZERO + END DO +* +* 3) Incremental updating of the K-th column of F: +* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H +* * A(I:M,K). +* + IF( K.GT.1 ) THEN + CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ), + $ A( I, 1 ), LDA, A( I, K ), 1, CZERO, + $ AUXV( 1 ), 1 ) +* + CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE, + $ F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE, + $ F( 1, K ), 1 ) + END IF +* +* =============================================================== +* +* Update the current I-th row of A: +* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) +* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. +* + IF( K.LT.N+NRHS ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ 1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA, + $ F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA ) + END IF +* + A( I, K ) = AIK +* +* Update the partial column 2-norms for the residual matrix, +* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. +* when K < MINMNFACT = min( M-IOFFSET, N ). +* + IF( K.LT.MINMNFACT ) THEN +* + DO J = K + 1, N + IF( VN1( J ).NE.ZERO ) THEN +* +* NOTE: The following lines follow from the analysis in +* Lapack Working Note 176. +* + TEMP = ABS( A( I, J ) ) / VN1( J ) + TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) + TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2 + IF( TEMP2.LE.TOL3Z ) THEN +* +* At J-index, we have a difficult column for the +* update of the 2-norm. Save the index of the previous +* difficult column in IWORK(J-1). +* NOTE: ILSTCC > 1, threfore we can use IWORK only +* with N-1 elements, where the elements are +* shifted by 1 to the left. +* + IWORK( J-1 ) = LSTICC +* +* Set the index of the last difficult column LSTICC. +* + LSTICC = J +* + ELSE + VN1( J ) = VN1( J )*SQRT( TEMP ) + END IF + END IF + END DO +* + END IF +* +* End of while loop. +* + END DO +* +* Now, afler the loop: +* Set KB, the number of factorized columns in the block; +* Set IF, the number of processed rows in the block, which +* is the same as the number of processed rows in +* the original whole matrix A_orig, IF = IOFFSET + KB. +* + KB = K + IF = I +* +* Apply the block reflector to the residual of the matrix A +* and the residual of the right hand sides B, if the residual +* matrix and and/or the residual of the right hand sides +* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. +* This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ): +* +* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - +* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. +* + IF( KB.LT.MINMNUPDT ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA, + $ F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA ) + END IF +* +* Recompute the 2-norm of the difficult columns. +* Loop over the index of the difficult columns from the largest +* to the smallest index. +* + DO WHILE( LSTICC.GT.0 ) +* +* LSTICC is the index of the last difficult column is greater +* than 1. +* ITEMP is the index of the previous difficult column. +* + ITEMP = IWORK( LSTICC-1 ) +* +* Compute the 2-norm explicilty for the last difficult column and +* save it in the partial and exact 2-norm vectors VN1 and VN2. +* +* NOTE: The computation of VN1( LSTICC ) relies on the fact that +* DZNRM2 does not fail on vectors with norm below the value of +* SQRT(DLAMCH('S')) +* + VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 ) + VN2( LSTICC ) = VN1( LSTICC ) +* +* Downdate the index of the last difficult column to +* the index of the previous difficult column. +* + LSTICC = ITEMP +* + END DO +* + RETURN +* +* End of ZLAQP3RK +* + END From 40109c03927e3b5249f6e34dfe9279b15075fd8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 09:50:30 +0100 Subject: [PATCH 417/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/TESTING/EIG/alareq.f | 6 +++--- lapack-netlib/TESTING/EIG/alarqg.f | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f index 2585a686a..2cbe6db38 100644 --- a/lapack-netlib/TESTING/EIG/alareq.f +++ b/lapack-netlib/TESTING/EIG/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f index 6e2e6e7ec..b9fb88c65 100644 --- a/lapack-netlib/TESTING/EIG/alarqg.f +++ b/lapack-netlib/TESTING/EIG/alarqg.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: From 387830b9d5df8956483d515fce3025502d325dea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 09:53:06 +0100 Subject: [PATCH 418/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 11 +- lapack-netlib/TESTING/LIN/Makefile | 48 +- lapack-netlib/TESTING/LIN/alaerh.f | 17 + lapack-netlib/TESTING/LIN/alahd.f | 62 +- lapack-netlib/TESTING/LIN/alareq.f | 6 +- lapack-netlib/TESTING/LIN/cchkaa.F | 43 +- lapack-netlib/TESTING/LIN/cchkqp3rk.f | 836 +++++++++++++++++++++++ lapack-netlib/TESTING/LIN/clatb4.f | 121 +++- lapack-netlib/TESTING/LIN/cqpt01.f | 23 +- lapack-netlib/TESTING/LIN/cqrt11.f | 4 +- lapack-netlib/TESTING/LIN/cqrt12.f | 19 +- lapack-netlib/TESTING/LIN/dchkaa.F | 48 +- lapack-netlib/TESTING/LIN/dchkq3.f | 2 +- lapack-netlib/TESTING/LIN/dchkqp3rk.f | 832 ++++++++++++++++++++++ lapack-netlib/TESTING/LIN/dlatb4.f | 123 +++- lapack-netlib/TESTING/LIN/dqpt01.f | 42 +- lapack-netlib/TESTING/LIN/dqrt11.f | 4 +- lapack-netlib/TESTING/LIN/dqrt12.f | 22 +- lapack-netlib/TESTING/LIN/schkaa.F | 48 +- lapack-netlib/TESTING/LIN/schkqp3rk.f | 831 ++++++++++++++++++++++ lapack-netlib/TESTING/LIN/slatb4.f | 121 +++- lapack-netlib/TESTING/LIN/sqpt01.f | 23 +- lapack-netlib/TESTING/LIN/sqrt11.f | 4 +- lapack-netlib/TESTING/LIN/sqrt12.f | 18 +- lapack-netlib/TESTING/LIN/zchkaa.F | 59 +- lapack-netlib/TESTING/LIN/zchkqp3rk.f | 836 +++++++++++++++++++++++ lapack-netlib/TESTING/LIN/zlatb4.f | 121 +++- lapack-netlib/TESTING/LIN/zqpt01.f | 22 +- lapack-netlib/TESTING/LIN/zqrt11.f | 4 +- lapack-netlib/TESTING/LIN/zqrt12.f | 20 +- 30 files changed, 4132 insertions(+), 238 deletions(-) create mode 100644 lapack-netlib/TESTING/LIN/cchkqp3rk.f create mode 100644 lapack-netlib/TESTING/LIN/dchkqp3rk.f create mode 100644 lapack-netlib/TESTING/LIN/schkqp3rk.f create mode 100644 lapack-netlib/TESTING/LIN/zchkqp3rk.f diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 676857a80..e28818c76 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -9,7 +9,7 @@ set(DZLNTST dlaord.f) set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f - schkpt.f schkq3.f schkql.f schkqr.f schkrq.f + schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f schksp.f schksy.f schksy_rook.f schksy_rk.f schksy_aa.f schksy_aa_2stage.f schktb.f schktp.f schktr.f @@ -56,7 +56,7 @@ set(CLINTST cchkaa.F cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f cchkhp.f cchklq.f cchkpb.f - cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f + cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f cchksy_aa.f cchksy_aa_2stage.f cchktb.f @@ -110,7 +110,7 @@ endif() set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f - dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f + dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f dchksy_aa.f dchksy_aa_2stage.f dchktb.f dchktp.f dchktr.f @@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f zchkhp.f zchklq.f zchkpb.f - zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f + zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f zchksy_aa.f zchksy_aa_2stage.f zchktb.f @@ -239,8 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) -#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 64abc4dba..46e096c2f 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -45,7 +45,7 @@ DZLNTST = dlaord.o SLINTST = schkaa.o \ schkeq.o schkgb.o schkge.o schkgt.o \ schklq.o schkpb.o schkpo.o schkps.o schkpp.o \ - schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \ + schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \ schksp.o schksy.o schksy_rook.o schksy_rk.o \ schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \ schktz.o \ @@ -89,7 +89,7 @@ CLINTST = cchkaa.o \ cchkeq.o cchkgb.o cchkge.o cchkgt.o \ cchkhe.o cchkhe_rook.o cchkhe_rk.o \ cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \ - cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \ + cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \ cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \ cchksy_aa.o cchksy_aa_2stage.o cchktb.o \ cchktp.o cchktr.o cchktz.o \ @@ -137,7 +137,7 @@ endif DLINTST = dchkaa.o \ dchkeq.o dchkgb.o dchkge.o dchkgt.o \ dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \ - dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \ + dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \ dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o \ dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \ dchktz.o \ @@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \ zchkeq.o zchkgb.o zchkge.o zchkgt.o \ zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \ zchkhp.o zchklq.o zchkpb.o \ - zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \ + zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \ zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \ zchksy_aa.o zchksy_aa_2stage.o zchktb.o \ zchktp.o zchktr.o zchktz.o \ @@ -269,35 +269,35 @@ proto-double: xlintstds xlintstrfd proto-complex: xlintstrfc proto-complex16: xlintstzc xlintstrfz -xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(ALINTST): $(FRC) $(SCLNTST): $(FRC) diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f index 1845888a6..6c8a47f1e 100644 --- a/lapack-netlib/TESTING/LIN/alaerh.f +++ b/lapack-netlib/TESTING/LIN/alaerh.f @@ -797,6 +797,18 @@ WRITE( NOUT, FMT = 9978 ) $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT END IF +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* xQK: truncated QR factorization with pivoting +* + IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' ) ) THEN + WRITE( NOUT, FMT = 9930 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT + ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN + WRITE( NOUT, FMT = 9978 ) + $ SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT + END IF * ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN * @@ -1147,6 +1159,11 @@ * What we do next * 9949 FORMAT( ' ==> Doing only the condition estimate for this case' ) +* +* SUBNAM, INFO, M, N, NB, IMAT +* + 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5, + $ ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 ) * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index dd75394b3..8f966c584 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -584,13 +584,27 @@ * * QR decomposition with column pivoting * - WRITE( IOUNIT, FMT = 9986 )PATH + WRITE( IOUNIT, FMT = 8006 )PATH WRITE( IOUNIT, FMT = 9969 ) WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) WRITE( IOUNIT, FMT = 9940 )1 WRITE( IOUNIT, FMT = 9939 )2 WRITE( IOUNIT, FMT = 9938 )3 WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) +* + ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN +* +* truncated QR decomposition with column pivoting +* + WRITE( IOUNIT, FMT = 8006 )PATH + WRITE( IOUNIT, FMT = 9871 ) + WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' ) + WRITE( IOUNIT, FMT = 8060 )1 + WRITE( IOUNIT, FMT = 8061 )2 + WRITE( IOUNIT, FMT = 8062 )3 + WRITE( IOUNIT, FMT = 8063 )4 + WRITE( IOUNIT, FMT = 8064 )5 + WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN * @@ -779,6 +793,8 @@ $ 'tall-skinny or short-wide matrices' ) 8005 FORMAT( / 1X, A3, ': Householder reconstruction from TSQR', $ ' factorization output ', /,' for tall-skinny matrices.' ) + 8006 FORMAT( / 1X, A3, ': truncated QR factorization', + $ ' with column pivoting' ) * * GE matrix types * @@ -922,6 +938,36 @@ $ / 4X, '3. Geometric distribution', 10X, $ '6. Every second column fixed' ) * +* QK matrix types +* + 9871 FORMAT( 4X, ' 1. Zero matrix', / + $ 4X, ' 2. Random, Diagonal, CNDNUM = 2', / + $ 4X, ' 3. Random, Upper triangular, CNDNUM = 2', / + $ 4X, ' 4. Random, Lower triangular, CNDNUM = 2', / + $ 4X, ' 5. Random, First column is zero, CNDNUM = 2', / + $ 4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', / + $ 4X, ' 7. Random, Last N column is zero, CNDNUM = 2', / + $ 4X, ' 8. Random, Middle column in MINMN is zero,', + $ ' CNDNUM = 2', / + $ 4X, ' 9. Random, First half of MINMN columns are zero,', + $ ' CNDNUM = 2', / + $ 4X, '10. Random, Last columns are zero starting from', + $ ' MINMN/2+1, CNDNUM = 2', / + $ 4X, '11. Random, Half MINMN columns in the middle are', + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' CNDNUM = 2', / + $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / + $ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / + $ 4X, '14. Random, CNDNUM = 2', / + $ 4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', / + $ 4X, '16. Random, CNDNUM = 0.1/EPS', / + $ 4X, '17. Random, CNDNUM = 0.1/EPS,', + $ ' one small singular value S(N)=1/CNDNUM', / + $ 4X, '18. Random, CNDNUM = 2, scaled near underflow,', + $ ' NORM = SMALL = SAFMIN', / + $ 4X, '19. Random, CNDNUM = 2, scaled near overflow,', + $ ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' ) +* * TZ matrix types * 9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X, @@ -1030,9 +1076,8 @@ $ ' * norm(C) * EPS )' ) 9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ', $ '( M * norm(svd(R)) * EPS )' ) - 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )' - $ ) - 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) + 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )') + 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' ) 9937 FORMAT( 3X, I2, ': norm( A - R*Q ) / ( M * norm(A) * EPS )' $ ) 9935 FORMAT( 3X, I2, ': norm( B - A * X ) / ', @@ -1105,6 +1150,15 @@ 8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' ) 8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )') + 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ', + $ '( max(M,N) * 2-norm(svd(R)) * EPS )' ) + 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)', + $ ' * EPS )') + 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' ) + 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))', + $ ' > abs(R(K,K)), where K=1:KFACT-1' ) + 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )') + * RETURN * diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f index db18775eb..3f057fa48 100644 --- a/lapack-netlib/TESTING/LIN/alareq.f +++ b/lapack-netlib/TESTING/LIN/alareq.f @@ -28,12 +28,12 @@ *> to evaluate the input line which requested NMATS matrix types for *> PATH. The flow of control is as follows: *> -*> If NMATS = NTYPES then +*> IF NMATS = NTYPES THEN *> DOTYPE(1:NTYPES) = .TRUE. -*> else +*> ELSE *> Read the next input line for NMATS matrix types *> Set DOTYPE(I) = .TRUE. for each valid type I -*> endif +*> END IF *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F index ec1534ed4..474454a51 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.F +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -69,6 +69,7 @@ *> CLQ 8 List types on next line if 0 < NTYPES < 8 *> CQL 8 List types on next line if 0 < NTYPES < 8 *> CQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> CTZ 3 List types on next line if 0 < NTYPES < 3 *> CLS 6 List types on next line if 0 < NTYPES < 6 *> CEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL S( 2*NMAX ) - COMPLEX E( NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: E COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. @@ -170,14 +170,14 @@ EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL, + $ CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK, + $ CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, + $ CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE, + $ CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP, + $ CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP, + $ CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, + $ ILAVER, CCHKQRT, CCHKQRTP * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -203,6 +203,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. @@ -1109,6 +1113,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f new file mode 100644 index 000000000..79d6add72 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b CCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* REAL S( * ), RWORK( * ) +* COMPLEX A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKQP3RK tests CGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is REAL array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL S( * ), RWORK( * ) + COMPLEX A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + COMPLEX CONE, CZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ CZERO = ( 0.0E+0, 0.0E+0 ), + $ CONE = ( 1.0E+0, 0.0E+0 ), + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE + EXTERNAL SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY, + $ XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4, + $ CLATMS, CUNMQR, CSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, CUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Complex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with CLATB4 and generate +* M-by-NRHS B matrix with CLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with CLATMS. +* + CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' + CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL SLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'CLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from CLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL CSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL CSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 ) + + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL CLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL CLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOl = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute CGEQP3RK factorization of A. +* + SRNAMT = 'CGEQP3RK' + CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, RWORK, IWORK( 2*N+1 ), + $ INFO ) +* +* Check error code from CGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* + IF( KFACT.EQ.MINMN ) THEN +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK, + $ LWORK , RWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF + +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 +* + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL CUNMQR( 'Left', 'Conjugate transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of CCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f index eeb0f03a9..233a8631a 100644 --- a/lapack-netlib/TESTING/LIN/clatb4.f +++ b/lapack-netlib/TESTING/LIN/clatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f index 79fc2dc66..149c5bb7c 100644 --- a/lapack-netlib/TESTING/LIN/cqpt01.f +++ b/lapack-netlib/TESTING/LIN/cqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f index 494d5e9cd..a52084973 100644 --- a/lapack-netlib/TESTING/LIN/cqrt11.f +++ b/lapack-netlib/TESTING/LIN/cqrt11.f @@ -157,9 +157,9 @@ CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f index 4c29423ae..0df2d833b 100644 --- a/lapack-netlib/TESTING/LIN/cqrt12.f +++ b/lapack-netlib/TESTING/LIN/cqrt12.f @@ -28,7 +28,7 @@ *> CQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues -s ||/( ||s||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL CLANGE, SASUM, SLAMCH, SNRM2 * .. * .. External Subroutines .. - EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD, - $ SLASCL, XERBLA + EXTERNAL CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL, + $ XERBLA * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN, REAL @@ -153,17 +153,16 @@ * Copy upper triangle of A into work * CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -207,9 +206,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F index ef9d7808c..74077eb94 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.F +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -63,6 +63,7 @@ *> DLQ 8 List types on next line if 0 < NTYPES < 8 *> DQL 8 List types on next line if 0 < NTYPES < 8 *> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> DTZ 3 List types on next line if 0 < NTYPES < 3 *> DLS 6 List types on next line if 0 < NTYPES < 6 *> DEQ @@ -149,12 +150,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK - DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -164,13 +165,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR + $ DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR, + $ DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK, + $ DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ, + $ DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO, + $ DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, + $ DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, + $ DCHKQRTP, DCHKLQT,DCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,6 +198,10 @@ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * @@ -919,9 +924,26 @@ CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) * IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, + $ NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) ELSE WRITE( NOUT, FMT = 9989 )PATH END IF diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f index 1fdf07252..494008fa8 100644 --- a/lapack-netlib/TESTING/LIN/dchkq3.f +++ b/lapack-netlib/TESTING/LIN/dchkq3.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKQ3 tests DGEQP3. +*> DCHKQ3 tests DGEQP3. *> \endverbatim * * Arguments: diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f new file mode 100644 index 000000000..434d2067e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f @@ -0,0 +1,832 @@ +*> \brief \b DCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKQP3RK tests DGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE, + $ DLAPY2 + EXTERNAL DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK, + $ DLACPY, DLAORD, DLASET, DLATB4, DLATMS, + $ DORMQR, DSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Double precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with DLATB4 and generate +* M-by-NRHS B matrix with DLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with DLATMS. +* + CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' + CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'DLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from DLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL DSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL DSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL DLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL DLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOL = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute DGEQP3RK factorization of A. +* + SRNAMT = 'DGEQP3RK' + CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, IWORK( 2*N+1 ), INFO ) +* +* Check error code from DGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + IF( KFACT.EQ.MINMN ) THEN +* + RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK, + $ LWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF +* +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 + + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL DORMQR( 'Left', 'Transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of DCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f index 8825d13e7..f3bccd45b 100644 --- a/lapack-netlib/TESTING/LIN/dlatb4.f +++ b/lapack-netlib/TESTING/LIN/dlatb4.f @@ -133,7 +133,7 @@ * * .. Parameters .. DOUBLE PRECISION SHRINK, TENTH - PARAMETER ( SHRINK = 0.25D0, TENTH = 0.1D+0 ) + PARAMETER ( SHRINK = 0.25D+0, TENTH = 0.1D+0 ) DOUBLE PRECISION ONE PARAMETER ( ONE = 1.0D+0 ) DOUBLE PRECISION TWO @@ -153,9 +153,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL DLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -173,11 +170,6 @@ BADC1 = SQRT( BADC2 ) SMALL = DLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL DLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -232,6 +224,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -518,17 +614,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f index 8efbdc774..af3f5dd36 100644 --- a/lapack-netlib/TESTING/LIN/dqpt01.f +++ b/lapack-netlib/TESTING/LIN/dqpt01.f @@ -28,12 +28,13 @@ *> *> DQPT01 tests the QR-factorization with pivoting of a matrix A. The *> array AF contains the (possibly partial) QR-factorization of A, where -*> the upper triangle of AF(1:k,1:k) is a partial triangular factor, -*> the entries below the diagonal in the first k columns are the +*> the upper triangle of AF(1:K,1:K) is a partial triangular factor, +*> the entries below the diagonal in the first K columns are the *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ), +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,41 @@ * NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K +* +* Copy the upper triangular part of the factor R stored +* in AF(1:K,1:K) into the work array WORK. +* + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO +* +* Zero out the elements below the diagonal in the work array. +* + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO +* +* Copy columns (K+1,N) from AF into the work array WORK. +* AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal +* factor R, AF(K+1:M,K+1:N) contains the partially updated residual +* matrix of R. +* + DO J = K + 1, N CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * -* Compare i-th column of QR and jpvt(i)-th column of A +* Compare J-th column of QR and JPVT(J)-th column of A. * CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f index 33c7fab37..38bbeb822 100644 --- a/lapack-netlib/TESTING/LIN/dqrt11.f +++ b/lapack-netlib/TESTING/LIN/dqrt11.f @@ -157,9 +157,9 @@ CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f index 278e01bf0..b8a124c59 100644 --- a/lapack-netlib/TESTING/LIN/dqrt12.f +++ b/lapack-netlib/TESTING/LIN/dqrt12.f @@ -26,7 +26,7 @@ *> DQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL DASUM, DLAMCH, DLANGE, DNRM2 * .. * .. External Subroutines .. - EXTERNAL DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET, - $ XERBLA + EXTERNAL DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN @@ -145,17 +144,16 @@ * Copy upper triangle of A into work * CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -199,16 +197,18 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN WORK( M*N+I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work * CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 ) +* DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) / - $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) + $ ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ DQRT12 = DQRT12 / NRMSVL * diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F index a5b826d06..2b9f2ea45 100644 --- a/lapack-netlib/TESTING/LIN/schkaa.F +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -63,6 +63,7 @@ *> SLQ 8 List types on next line if 0 < NTYPES < 8 *> SQL 8 List types on next line if 0 < NTYPES < 8 *> SQP 6 List types on next line if 0 < NTYPES < 6 +*> DQK 19 List types on next line if 0 < NTYPES < 19 *> STZ 3 List types on next line if 0 < NTYPES < 3 *> SLS 6 List types on next line if 0 < NTYPES < 6 *> SEQ @@ -147,11 +148,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL E( NMAX ), S( 2*NMAX ) * .. * .. Allocatable Arrays .. INTEGER AllocateStatus - REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + REAL, DIMENSION(:), ALLOCATABLE :: E REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. @@ -162,13 +163,13 @@ * .. External Subroutines .. EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, - $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, - $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, - $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, - $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, - $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, - $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, - $ SCHKLQT, SCHKTSQR + $ SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR, + $ SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK, + $ SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ, + $ SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO, + $ SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK, + $ SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, + $ SCHKQRTP, SCHKLQT, SCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -188,13 +189,17 @@ * .. * .. Allocate memory dynamically .. * - ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -920,6 +925,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), B( 1, 4 ), + $ WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f new file mode 100644 index 000000000..36cf9370e --- /dev/null +++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f @@ -0,0 +1,831 @@ +*> \brief \b SCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNS, NNB, NOUT +* REAL THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ), +* $ NVAL( * ), NXVAL( * ) +* REAL A( * ), COPYA( * ), B( * ), COPYB( * ), +* $ S( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKQP3RK tests SGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is REAL array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is REAL array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is REAL array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is REAL array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is REAL array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is REAL array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension +*> (MMAX*NMAX + 4*NMAX + MMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + REAL THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + REAL A( * ), COPYA( * ), B( * ), COPYB( * ), + $ S( * ), TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + REAL ONE, ZERO, BIGNUM + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0, + $ BIGNUM = 1.0E+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + REAL ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + REAL RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE + EXTERNAL SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK, + $ SLACPY, SLAORD, SLASET, SLATB4, SLATMS, + $ SORMQR, SSWAP, ICOPY, XLAENV +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN, MOD, REAL +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Single precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = SLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with SLATB4 and generate +* M-by-NRHS B matrix with SLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) + + +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with SLATB4 and generate a test +* matrix with SLATMS. +* + CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' + CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL SLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'SLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from SLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL SSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL SSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for SGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL SLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL SLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOL = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute SGEQP3RK factorization of A. +* + SRNAMT = 'SGEQP3RK' + CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, IWORK( 2*N+1 ), INFO ) +* +* Check error code from SGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + IF( KFACT.EQ.MINMN ) THEN +* + RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK, + $ LWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF +* +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 + + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL SORMQR( 'Left', 'Transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of SCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f index 94d29db40..72a310727 100644 --- a/lapack-netlib/TESTING/LIN/slatb4.f +++ b/lapack-netlib/TESTING/LIN/slatb4.f @@ -153,9 +153,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL SLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -173,11 +170,6 @@ BADC1 = SQRT( BADC2 ) SMALL = SLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL SLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -232,6 +224,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -518,17 +614,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f index de0c80e53..f53686a65 100644 --- a/lapack-netlib/TESTING/LIN/sqpt01.f +++ b/lapack-netlib/TESTING/LIN/sqpt01.f @@ -33,7 +33,8 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) +*> where || . || is matrix one norm. *> \endverbatim * * Arguments: @@ -172,28 +173,28 @@ * NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ), $ 1 ) - 50 CONTINUE + END DO * SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f index d4422dacb..a3753adcf 100644 --- a/lapack-netlib/TESTING/LIN/sqrt11.f +++ b/lapack-netlib/TESTING/LIN/sqrt11.f @@ -157,9 +157,9 @@ CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M, $ WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( REAL( M )*SLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f index 2eab0ee0d..46b359e07 100644 --- a/lapack-netlib/TESTING/LIN/sqrt12.f +++ b/lapack-netlib/TESTING/LIN/sqrt12.f @@ -26,7 +26,7 @@ *> SQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s ||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -113,8 +113,7 @@ EXTERNAL SASUM, SLAMCH, SLANGE, SNRM2 * .. * .. External Subroutines .. - EXTERNAL SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET, - $ XERBLA + EXTERNAL SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, REAL @@ -145,17 +144,16 @@ * Copy upper triangle of A into work * CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -199,9 +197,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN WORK( M*N+I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F index a118515a5..57d71833f 100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.F +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -69,6 +69,7 @@ *> ZLQ 8 List types on next line if 0 < NTYPES < 8 *> ZQL 8 List types on next line if 0 < NTYPES < 8 *> ZQP 6 List types on next line if 0 < NTYPES < 6 +*> ZQK 19 List types on next line if 0 < NTYPES < 19 *> ZTZ 3 List types on next line if 0 < NTYPES < 3 *> ZLS 6 List types on next line if 0 < NTYPES < 6 *> ZEQ @@ -153,12 +154,11 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION S( 2*NMAX ) - COMPLEX*16 E( NMAX ) -* -* .. Allocatable Arrays .. +* .. +* .. Allocatable Arrays .. INTEGER AllocateStatus - DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. @@ -170,15 +170,16 @@ EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL, + $ ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK, + $ ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, + $ ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, + $ ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA, + $ ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, + $ ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY, + $ ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA, + $ ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP, + $ ZCHKLQT, ZCHKLQTP, ZCHKTSQR * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -197,13 +198,18 @@ DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * * .. Allocate memory dynamically .. - ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) +* + ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus) + IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***" + ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + ALLOCATE ( E( NMAX ), STAT = AllocateStatus ) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" - ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. @@ -1109,6 +1115,23 @@ ELSE WRITE( NOUT, FMT = 9989 )PATH END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* QK: truncated QR factorization with pivoting +* + NTYPES = 19 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ S( 1 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF * ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f new file mode 100644 index 000000000..302c7b1a8 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f @@ -0,0 +1,836 @@ +*> \brief \b ZCHKQP3RK +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, +* $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, +* $ B, COPYB, S, TAU, +* $ WORK, RWORK, IWORK, NOUT ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER NM, NN, NNB, NOUT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* LOGICAL DOTYPE( * ) +* INTEGER IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ), +* $ NXVAL( * ) +* DOUBLE PRECISION S( * ), RWORK( * ) +* COMPLEX*16 A( * ), COPYA( * ), TAU( * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKQP3RK tests ZGEQP3RK. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DOTYPE +*> \verbatim +*> DOTYPE is LOGICAL array, dimension (NTYPES) +*> The matrix types to be used for testing. Matrices of type j +*> (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) = +*> .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used. +*> \endverbatim +*> +*> \param[in] NM +*> \verbatim +*> NM is INTEGER +*> The number of values of M contained in the vector MVAL. +*> \endverbatim +*> +*> \param[in] MVAL +*> \verbatim +*> MVAL is INTEGER array, dimension (NM) +*> The values of the matrix row dimension M. +*> \endverbatim +*> +*> \param[in] NN +*> \verbatim +*> NN is INTEGER +*> The number of values of N contained in the vector NVAL. +*> \endverbatim +*> +*> \param[in] NVAL +*> \verbatim +*> NVAL is INTEGER array, dimension (NN) +*> The values of the matrix column dimension N. +*> \endverbatim +*> +*> \param[in] NNS +*> \verbatim +*> NNS is INTEGER +*> The number of values of NRHS contained in the vector NSVAL. +*> \endverbatim +*> +*> \param[in] NSVAL +*> \verbatim +*> NSVAL is INTEGER array, dimension (NNS) +*> The values of the number of right hand sides NRHS. +*> \endverbatim +*> \param[in] NNB +*> \verbatim +*> NNB is INTEGER +*> The number of values of NB and NX contained in the +*> vectors NBVAL and NXVAL. The blocking parameters are used +*> in pairs (NB,NX). +*> \endverbatim +*> +*> \param[in] NBVAL +*> \verbatim +*> NBVAL is INTEGER array, dimension (NNB) +*> The values of the blocksize NB. +*> \endverbatim +*> +*> \param[in] NXVAL +*> \verbatim +*> NXVAL is INTEGER array, dimension (NNB) +*> The values of the crossover point NX. +*> \endverbatim +*> +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> The threshold value for the test ratios. A result is +*> included in the output file if RESULT >= THRESH. To have +*> every test ratio printed, use THRESH = 0. +*> \endverbatim +*> +*> \param[out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (MMAX*NMAX) +*> where MMAX is the maximum value of M in MVAL and NMAX is the +*> maximum value of N in NVAL. +*> \endverbatim +*> +*> \param[out] COPYA +*> \verbatim +*> COPYA is COMPLEX*16 array, dimension (MMAX*NMAX) +*> \endverbatim +*> +*> \param[out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> where MMAX is the maximum value of M in MVAL and NSMAX is the +*> maximum value of NRHS in NSVAL. +*> \endverbatim +*> +*> \param[out] COPYB +*> \verbatim +*> COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX) +*> \endverbatim +*> +*> \param[out] S +*> \verbatim +*> S is DOUBLE PRECISION array, dimension +*> (min(MMAX,NMAX)) +*> \endverbatim +*> +*> \param[out] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (MMAX) +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension +*> (max(M*max(M,N) + 4*min(M,N) + max(M,N))) +*> \endverbatim +*> +*> \param[out] RWORK +*> \verbatim +*> RWORK is DOUBLE PRECISION array, dimension (4*NMAX) +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (2*NMAX) +*> \endverbatim +*> +*> \param[in] NOUT +*> \verbatim +*> NOUT is INTEGER +*> The unit number for output. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, + $ NNB, NBVAL, NXVAL, THRESH, A, COPYA, + $ B, COPYB, S, TAU, + $ WORK, RWORK, IWORK, NOUT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER NM, NN, NNB, NNS, NOUT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + LOGICAL DOTYPE( * ) + INTEGER IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ), + $ NSVAL( * ), NXVAL( * ) + DOUBLE PRECISION S( * ), RWORK( * ) + COMPLEX*16 A( * ), COPYA( * ), B( * ), COPYB( * ), + $ TAU( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NTYPES + PARAMETER ( NTYPES = 19 ) + INTEGER NTESTS + PARAMETER ( NTESTS = 5 ) + DOUBLE PRECISION ONE, ZERO, BIGNUM + COMPLEX*16 CONE, CZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0, + $ CZERO = ( 0.0D+0, 0.0D+0 ), + $ CONE = ( 1.0D+0, 0.0D+0 ), + $ BIGNUM = 1.0D+38 ) +* .. +* .. Local Scalars .. + CHARACTER DIST, TYPE + CHARACTER*3 PATH + INTEGER I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO, + $ INB, IND_OFFSET_GEN, + $ IND_IN, IND_OUT, INS, INFO, + $ ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO, + $ KFACT, KL, KMAX, KU, LDA, LW, LWORK, + $ LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N, + $ NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS, + $ NRUN, NX, T + DOUBLE PRECISION ANORM, CNDNUM, EPS, ABSTOL, RELTOL, + $ DTEMP, MAXC2NRMK, RELMAXC2NRMK +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ), ISEEDY( 4 ) + DOUBLE PRECISION RESULT( NTESTS ), RDUMMY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE + EXTERNAL DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY, + $ XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4, + $ ZLATMS, ZUNMQR, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, MIN, MOD +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, IOUNIT, ZUNMQR_LWORK +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, IOUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEEDY / 1988, 1989, 1990, 1991 / +* .. +* .. Executable Statements .. +* +* Initialize constants and the random number seed. +* + PATH( 1: 1 ) = 'Zomplex precision' + PATH( 2: 3 ) = 'QK' + NRUN = 0 + NFAIL = 0 + NERRS = 0 + DO I = 1, 4 + ISEED( I ) = ISEEDY( I ) + END DO + EPS = DLAMCH( 'Epsilon' ) + INFOT = 0 +* + DO IM = 1, NM +* +* Do for each value of M in MVAL. +* + M = MVAL( IM ) + LDA = MAX( 1, M ) +* + DO IN = 1, NN +* +* Do for each value of N in NVAL. +* + N = NVAL( IN ) + MINMN = MIN( M, N ) + LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ), + $ M*N + 2*MINMN + 4*N ) +* + DO INS = 1, NNS + NRHS = NSVAL( INS ) +* +* Set up parameters with ZLATB4 and generate +* M-by-NRHS B matrix with ZLATMS. +* IMAT = 14: +* Random matrix, CNDNUM = 2, NORM = ONE, +* MODE = 3 (geometric distribution of singular values). +* + CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYB, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NRHS, -1, -1, -1, 6, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + DO IMAT = 1, NTYPES +* +* Do the tests only if DOTYPE( IMAT ) is true. +* + IF( .NOT.DOTYPE( IMAT ) ) + $ CYCLE +* +* The type of distribution used to generate the random +* eigen-/singular values: +* ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 ) +* +* Do for each type of NON-SYMMETRIC matrix: CNDNUM NORM MODE +* 1. Zero matrix +* 2. Random, Diagonal, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 3. Random, Upper triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 4. Random, Lower triangular, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 5. Random, First column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 6. Random, Last MINMN column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 7. Random, Last N column is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 8. Random, Middle column in MINMN is zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 9. Random, First half of MINMN columns are zero, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 11. Random, Half MINMN columns in the middle are zero starting +* from MINMN/2-(MINMN/2)/2+1, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 12. Random, Odd columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 13. Random, Even columns are ZERO, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 14. Random, CNDNUM = 2 CNDNUM = 2 ONE 3 ( geometric distribution of singular values ) +* 15. Random, CNDNUM = sqrt(0.1/EPS) CNDNUM = BADC1 = sqrt(0.1/EPS) ONE 3 ( geometric distribution of singular values ) +* 16. Random, CNDNUM = 0.1/EPS CNDNUM = BADC2 = 0.1/EPS ONE 3 ( geometric distribution of singular values ) +* 17. Random, CNDNUM = 0.1/EPS, CNDNUM = BADC2 = 0.1/EPS ONE 2 ( one small singular value, S(N)=1/CNDNUM ) +* one small singular value S(N)=1/CNDNUM +* 18. Random, CNDNUM = 2, scaled near underflow CNDNUM = 2 SMALL = SAFMIN +* 19. Random, CNDNUM = 2, scaled near overflow CNDNUM = 2 LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) ) 3 ( geometric distribution of singular values ) +* + IF( IMAT.EQ.1 ) THEN +* +* Matrix 1: Zero matrix +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA ) + DO I = 1, MINMN + S( I ) = ZERO + END DO +* + ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 ) + $ .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN +* +* Matrices 2-5. +* +* Set up parameters with DLATB4 and generate a test +* matrix with ZLATMS. +* + CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM, + $ MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' + CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA, LDA, WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, + $ NOUT ) + CYCLE + END IF +* + CALL DLAORD( 'Decreasing', MINMN, S, 1 ) +* + ELSE IF( MINMN.GE.2 + $ .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN +* +* Rectangular matrices 5-13 that contain zero columns, +* only for matrices MINMN >=2. +* +* JB_ZERO is the column index of ZERO block. +* NB_ZERO is the column block size of ZERO block. +* NB_GEN is the column blcok size of the +* generated block. +* J_INC in the non_zero column index increment +* for matrix 12 and 13. +* J_FIRS_NZ is the index of the first non-zero +* column. +* + IF( IMAT.EQ.5 ) THEN +* +* First column is zero. +* + JB_ZERO = 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.6 ) THEN +* +* Last column MINMN is zero. +* + JB_ZERO = MINMN + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.7 ) THEN +* +* Last column N is zero. +* + JB_ZERO = N + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.8 ) THEN +* +* Middle column in MINMN is zero. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.9 ) THEN +* +* First half of MINMN columns is zero. +* + JB_ZERO = 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.10 ) THEN +* +* Last columns are zero columns, +* starting from (MINMN / 2 + 1) column. +* + JB_ZERO = MINMN / 2 + 1 + NB_ZERO = N - JB_ZERO + 1 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.11 ) THEN +* +* Half of the columns in the middle of MINMN +* columns is zero, starting from +* MINMN/2 - (MINMN/2)/2 + 1 column. +* + JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1 + NB_ZERO = MINMN / 2 + NB_GEN = N - NB_ZERO +* + ELSE IF( IMAT.EQ.12 ) THEN +* +* Odd-numbered columns are zero, +* + NB_GEN = N / 2 + NB_ZERO = N - NB_GEN + J_INC = 2 + J_FIRST_NZ = 2 +* + ELSE IF( IMAT.EQ.13 ) THEN +* +* Even-numbered columns are zero. +* + NB_ZERO = N / 2 + NB_GEN = N - NB_ZERO + J_INC = 2 + J_FIRST_NZ = 1 +* + END IF +* +* +* 1) Set the first NB_ZERO columns in COPYA(1:M,1:N) +* to zero. +* + CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO, + $ COPYA, LDA ) +* +* 2) Generate an M-by-(N-NB_ZERO) matrix with the +* chosen singular value distribution +* in COPYA(1:M,NB_ZERO+1:N). +* + CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU, + $ ANORM, MODE, CNDNUM, DIST ) +* + SRNAMT = 'ZLATMS' +* + IND_OFFSET_GEN = NB_ZERO * LDA +* + CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE, + $ CNDNUM, ANORM, KL, KU, 'No packing', + $ COPYA( IND_OFFSET_GEN + 1 ), LDA, + $ WORK, INFO ) +* +* Check error code from ZLATMS. +* + IF( INFO.NE.0 ) THEN + CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, + $ NB_GEN, -1, -1, -1, IMAT, NFAIL, + $ NERRS, NOUT ) + CYCLE + END IF +* +* 3) Swap the gererated colums from the right side +* NB_GEN-size block in COPYA into correct column +* positions. +* + IF( IMAT.EQ.6 + $ .OR. IMAT.EQ.7 + $ .OR. IMAT.EQ.8 + $ .OR. IMAT.EQ.10 + $ .OR. IMAT.EQ.11 ) THEN +* +* Move by swapping the generated columns +* from the right NB_GEN-size block from +* (NB_ZERO+1:NB_ZERO+JB_ZERO) +* into columns (1:JB_ZERO-1). +* + DO J = 1, JB_ZERO-1, 1 + CALL ZSWAP( M, + $ COPYA( ( NB_ZERO+J-1)*LDA+1), 1, + $ COPYA( (J-1)*LDA + 1 ), 1 ) + END DO +* + ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN +* +* ( IMAT = 12, Odd-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the even zero colums in the +* left NB_ZERO-size block. +* +* ( IMAT = 13, Even-numbered ZERO columns. ) +* Swap the generated columns from the right +* NB_GEN-size block into the odd zero colums in the +* left NB_ZERO-size block. +* + DO J = 1, NB_GEN, 1 + IND_OUT = ( NB_ZERO+J-1 )*LDA + 1 + IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA + $ + 1 + CALL ZSWAP( M, + $ COPYA( IND_OUT ), 1, + $ COPYA( IND_IN), 1 ) + END DO +* + END IF +* +* 5) Order the singular values generated by +* DLAMTS in decreasing order and add trailing zeros +* that correspond to zero columns. +* The total number of singular values is MINMN. +* + MINMNB_GEN = MIN( M, NB_GEN ) +* + CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 ) + + DO I = MINMNB_GEN+1, MINMN + S( I ) = ZERO + END DO +* + ELSE +* +* IF(MINMN.LT.2) skip this size for this matrix type. +* + CYCLE + END IF +* +* Initialize a copy array for a pivot array for DGEQP3RK. +* + DO I = 1, N + IWORK( I ) = 0 + END DO +* + DO INB = 1, NNB +* +* Do for each pair of values (NB,NX) in NBVAL and NXVAL. +* + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + NX = NXVAL( INB ) + CALL XLAENV( 3, NX ) +* +* We do MIN(M,N)+1 because we need a test for KMAX > N, +* when KMAX is larger than MIN(M,N), KMAX should be +* KMAX = MIN(M,N) +* + DO KMAX = 0, MIN(M,N)+1 +* +* Get a working copy of COPYA into A( 1:M,1:N ). +* Get a working copy of COPYB into A( 1:M, (N+1):NRHS ). +* Get a working copy of COPYB into into B( 1:M, 1:NRHS ). +* Get a working copy of IWORK(1:N) awith zeroes into +* which is going to be used as pivot array IWORK( N+1:2N ). +* NOTE: IWORK(2N+1:3N) is going to be used as a WORK array +* for the routine. +* + CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA ) + CALL ZLACPY( 'All', M, NRHS, COPYB, LDA, + $ A( LDA*N + 1 ), LDA ) + CALL ZLACPY( 'All', M, NRHS, COPYB, LDA, + $ B, LDA ) + CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 ) +* + ABSTOL = -1.0 + RELTOl = -1.0 +* +* Compute the QR factorization with pivoting of A +* + LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ), + $ 3*N + NRHS - 1 ) ) +* +* Compute ZGEQP3RK factorization of A. +* + SRNAMT = 'ZGEQP3RK' + CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ A, LDA, KFACT, MAXC2NRMK, + $ RELMAXC2NRMK, IWORK( N+1 ), TAU, + $ WORK, LW, RWORK, IWORK( 2*N+1 ), + $ INFO ) +* +* Check error code from ZGEQP3RK. +* + IF( INFO.LT.0 ) + $ CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ', + $ M, N, NX, -1, NB, IMAT, + $ NFAIL, NERRS, NOUT ) +* + IF( KFACT.EQ.MINMN ) THEN +* +* Compute test 1: +* +* This test in only for the full rank factorization of +* the matrix A. +* +* Array S(1:min(M,N)) contains svd(A) the sigular values +* of the original matrix A in decreasing absolute value +* order. The test computes svd(R), the vector sigular +* values of the upper trapezoid of A(1:M,1:N) that +* contains the factor R, in decreasing order. The test +* returns the ratio: +* +* 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS ) +* + RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK, + $ LWORK , RWORK ) +* + DO T = 1, 1 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, NB, NX, + $ IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 1 +* + END IF + +* Compute test 2: +* +* The test returns the ratio: +* +* 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS ) +* + RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU, + $ IWORK( N+1 ), WORK, LWORK ) +* +* Compute test 3: +* +* The test returns the ratio: +* +* 1-norm( Q**T * Q - I ) / ( M * EPS ) +* + RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK, + $ LWORK ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 2, 3 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 +* +* Compute test 4: +* +* This test is only for the factorizations with the +* rank greater than 2. +* The elements on the diagonal of R should be non- +* increasing. +* +* The test returns the ratio: +* +* Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)), +* K=1:KFACT-1 +* + IF( MIN(KFACT, MINMN).GE.2 ) THEN +* + DO J = 1, KFACT-1, 1 +* + DTEMP = (( ABS( A( (J-1)*M+J ) ) - + $ ABS( A( (J)*M+J+1 ) ) ) / + $ ABS( A(1) ) ) +* + IF( DTEMP.LT.ZERO ) THEN + RESULT( 4 ) = BIGNUM + END IF +* + END DO +* +* Print information about the tests that did not +* pass the threshold. +* + DO T = 4, 4 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', + $ M, N, NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, + $ RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End test 4. +* + END IF +* +* Compute test 5: +* +* This test in only for matrix A with min(M,N) > 0. +* +* The test returns the ratio: +* +* 1-norm(Q**T * B - Q**T * B ) / +* ( M * EPS ) +* +* (1) Compute B:=Q**T * B in the matrix B. +* + IF( MINMN.GT.0 ) THEN +* + LWORK_MQR = MAX(1, NRHS) + CALL ZUNMQR( 'Left', 'Conjugate transpose', + $ M, NRHS, KFACT, A, LDA, TAU, B, LDA, + $ WORK, LWORK_MQR, INFO ) +* + DO I = 1, NRHS +* +* Compare N+J-th column of A and J-column of B. +* + CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1, + $ B( ( I-1 )*LDA+1 ), 1 ) + END DO +* + RESULT( 5 ) = + $ ABS( + $ ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) / + $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) + $ ) +* +* Print information about the tests that did not pass +* the threshold. +* + DO T = 5, 5 + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N, + $ NRHS, KMAX, ABSTOL, RELTOL, + $ NB, NX, IMAT, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 1 +* +* End compute test 5. +* + END IF +* +* END DO KMAX = 1, MIN(M,N)+1 +* + END DO +* +* END DO for INB = 1, NNB +* + END DO +* +* END DO for IMAT = 1, NTYPES +* + END DO +* +* END DO for INS = 1, NNS +* + END DO +* +* END DO for IN = 1, NN +* + END DO +* +* END DO for IM = 1, NM +* + END DO +* +* Print a summary of the results. +* + CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) +* + 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5, + $ ', KMAX =', I5, ', ABSTOL =', G12.5, + $ ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4, + $ ', type ', I2, ', test ', I2, ', ratio =', G12.5 ) +* +* End of ZCHKQP3RK +* + END diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f index a6977f4e9..a2b19f83d 100644 --- a/lapack-netlib/TESTING/LIN/zlatb4.f +++ b/lapack-netlib/TESTING/LIN/zlatb4.f @@ -154,9 +154,6 @@ * .. Intrinsic Functions .. INTRINSIC ABS, MAX, SQRT * .. -* .. External Subroutines .. - EXTERNAL DLABAD -* .. * .. Save statement .. SAVE EPS, SMALL, LARGE, BADC1, BADC2, FIRST * .. @@ -174,11 +171,6 @@ BADC1 = SQRT( BADC2 ) SMALL = DLAMCH( 'Safe minimum' ) LARGE = ONE / SMALL -* -* If it looks like we're on a Cray, take the square root of -* SMALL and LARGE to avoid overflow and underflow problems. -* - CALL DLABAD( SMALL, LARGE ) SMALL = SHRINK*( SMALL / EPS ) LARGE = ONE / SMALL END IF @@ -233,6 +225,110 @@ ELSE ANORM = ONE END IF +* + ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN +* +* xQK: truncated QR with pivoting. +* Set parameters to generate a general +* M x N matrix. +* +* Set TYPE, the type of matrix to be generated. 'N' is nonsymmetric. +* + TYPE = 'N' +* +* Set DIST, the type of distribution for the random +* number generator. 'S' is +* + DIST = 'S' +* +* Set the lower and upper bandwidths. +* + IF( IMAT.EQ.2 ) THEN +* +* 2. Random, Diagonal, CNDNUM = 2 +* + KL = 0 + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.3 ) THEN +* +* 3. Random, Upper triangular, CNDNUM = 2 +* + KL = 0 + KU = MAX( N-1, 0 ) + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE IF( IMAT.EQ.4 ) THEN +* +* 4. Random, Lower triangular, CNDNUM = 2 +* + KL = MAX( M-1, 0 ) + KU = 0 + CNDNUM = TWO + ANORM = ONE + MODE = 3 + ELSE +* +* 5.-19. Rectangular matrix +* + KL = MAX( M-1, 0 ) + KU = MAX( N-1, 0 ) +* + IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN +* +* 5.-14. Random, CNDNUM = 2. +* + CNDNUM = TWO + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.15 ) THEN +* +* 15. Random, CNDNUM = sqrt(0.1/EPS) +* + CNDNUM = BADC1 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.16 ) THEN +* +* 16. Random, CNDNUM = 0.1/EPS +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 3 +* + ELSE IF( IMAT.EQ.17 ) THEN +* +* 17. Random, CNDNUM = 0.1/EPS, +* one small singular value S(N)=1/CNDNUM +* + CNDNUM = BADC2 + ANORM = ONE + MODE = 2 +* + ELSE IF( IMAT.EQ.18 ) THEN +* +* 18. Random, scaled near underflow +* + CNDNUM = TWO + ANORM = SMALL + MODE = 3 +* + ELSE IF( IMAT.EQ.19 ) THEN +* +* 19. Random, scaled near overflow +* + CNDNUM = TWO + ANORM = LARGE + MODE = 3 +* + END IF +* + END IF * ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN * @@ -517,17 +613,18 @@ * * Set the norm and condition number. * - IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN + MAT = ABS( IMAT ) + IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN CNDNUM = BADC1 - ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN + ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN CNDNUM = BADC2 ELSE CNDNUM = TWO END IF * - IF( IMAT.EQ.4 ) THEN + IF( MAT.EQ.4 ) THEN ANORM = SMALL - ELSE IF( IMAT.EQ.5 ) THEN + ELSE IF( MAT.EQ.5 ) THEN ANORM = LARGE ELSE ANORM = ONE diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f index 4e53f92c8..c69eb658f 100644 --- a/lapack-netlib/TESTING/LIN/zqpt01.f +++ b/lapack-netlib/TESTING/LIN/zqpt01.f @@ -33,7 +33,7 @@ *> Householder vectors, and the rest of AF contains a partially updated *> matrix. *> -*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M) +*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ) *> \endverbatim * * Arguments: @@ -172,28 +172,28 @@ * NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK ) * - DO 30 J = 1, K - DO 10 I = 1, MIN( J, M ) + DO J = 1, K + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = AF( I, J ) - 10 CONTINUE - DO 20 I = J + 1, M + END DO + DO I = J + 1, M WORK( ( J-1 )*M+I ) = ZERO - 20 CONTINUE - 30 CONTINUE - DO 40 J = K + 1, N + END DO + END DO + DO J = K + 1, N CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 ) - 40 CONTINUE + END DO * CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK, $ M, WORK( M*N+1 ), LWORK-M*N, INFO ) * - DO 50 J = 1, N + DO J = 1, N * * Compare i-th column of QR and jpvt(i)-th column of A * CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1, $ WORK( ( J-1 )*M+1 ), 1 ) - 50 CONTINUE + END DO * ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) / $ ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f index c3be59c36..dc4af744f 100644 --- a/lapack-netlib/TESTING/LIN/zqrt11.f +++ b/lapack-netlib/TESTING/LIN/zqrt11.f @@ -158,9 +158,9 @@ CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU, $ WORK, M, WORK( M*M+1 ), INFO ) * - DO 10 J = 1, M + DO J = 1, M WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE - 10 CONTINUE + END DO * ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) / $ ( DBLE( M )*DLAMCH( 'Epsilon' ) ) diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f index 0da6be157..91477b5ea 100644 --- a/lapack-netlib/TESTING/LIN/zqrt12.f +++ b/lapack-netlib/TESTING/LIN/zqrt12.f @@ -28,7 +28,7 @@ *> ZQRT12 computes the singular values `svlues' of the upper trapezoid *> of A(1:M,1:N) and returns the ratio *> -*> || s - svlues||/(||svlues||*eps*max(M,N)) +*> || svlues - s||/(||s||*eps*max(M,N)) *> \endverbatim * * Arguments: @@ -125,8 +125,8 @@ EXTERNAL DASUM, DLAMCH, DNRM2, ZLANGE * .. * .. External Subroutines .. - EXTERNAL DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2, - $ ZLASCL, ZLASET + EXTERNAL DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL, + $ ZLASET * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCMPLX, MAX, MIN @@ -154,17 +154,16 @@ * CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK, $ M ) - DO 20 J = 1, N - DO 10 I = 1, MIN( J, M ) + DO J = 1, N + DO I = 1, MIN( J, M ) WORK( ( J-1 )*M+I ) = A( I, J ) - 10 CONTINUE - 20 CONTINUE + END DO + END DO * * Get machine parameters * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale work if max entry outside range [SMLNUM,BIGNUM] * @@ -208,9 +207,9 @@ * ELSE * - DO 30 I = 1, MN + DO I = 1, MN RWORK( I ) = ZERO - 30 CONTINUE + END DO END IF * * Compare s and singular values of work @@ -218,6 +217,7 @@ CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 ) ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) / $ ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) ) +* IF( NRMSVL.NE.ZERO ) $ ZQRT12 = ZQRT12 / NRMSVL * From 0eb8a87977531b50a70011bad39f90ae57953e06 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 09:56:37 +0100 Subject: [PATCH 419/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/TESTING/ctest.in | 1 + lapack-netlib/TESTING/dtest.in | 1 + lapack-netlib/TESTING/stest.in | 1 + lapack-netlib/TESTING/ztest.in | 1 + 4 files changed, 4 insertions(+) diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in index a3588b4a1..74ff31ab8 100644 --- a/lapack-netlib/TESTING/ctest.in +++ b/lapack-netlib/TESTING/ctest.in @@ -42,6 +42,7 @@ CRQ 8 List types on next line if 0 < NTYPES < 8 CLQ 8 List types on next line if 0 < NTYPES < 8 CQL 8 List types on next line if 0 < NTYPES < 8 CQP 6 List types on next line if 0 < NTYPES < 6 +CQK 19 List types on next line if 0 < NTYPES < 19 CTZ 3 List types on next line if 0 < NTYPES < 3 CLS 6 List types on next line if 0 < NTYPES < 6 CEQ diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in index 29bb8b92e..1b6c7bd4a 100644 --- a/lapack-netlib/TESTING/dtest.in +++ b/lapack-netlib/TESTING/dtest.in @@ -36,6 +36,7 @@ DRQ 8 List types on next line if 0 < NTYPES < 8 DLQ 8 List types on next line if 0 < NTYPES < 8 DQL 8 List types on next line if 0 < NTYPES < 8 DQP 6 List types on next line if 0 < NTYPES < 6 +DQK 19 LIst types on next line if 0 < NTYPES < 19 DTZ 3 List types on next line if 0 < NTYPES < 3 DLS 6 List types on next line if 0 < NTYPES < 6 DEQ diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in index 27ac30040..7faa8b7a1 100644 --- a/lapack-netlib/TESTING/stest.in +++ b/lapack-netlib/TESTING/stest.in @@ -36,6 +36,7 @@ SRQ 8 List types on next line if 0 < NTYPES < 8 SLQ 8 List types on next line if 0 < NTYPES < 8 SQL 8 List types on next line if 0 < NTYPES < 8 SQP 6 List types on next line if 0 < NTYPES < 6 +SQK 19 List types on next line if 0 < NTYPES < 19 STZ 3 List types on next line if 0 < NTYPES < 3 SLS 6 List types on next line if 0 < NTYPES < 6 SEQ diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in index 58da33d60..c83e82e45 100644 --- a/lapack-netlib/TESTING/ztest.in +++ b/lapack-netlib/TESTING/ztest.in @@ -42,6 +42,7 @@ ZRQ 8 List types on next line if 0 < NTYPES < 8 ZLQ 8 List types on next line if 0 < NTYPES < 8 ZQL 8 List types on next line if 0 < NTYPES < 8 ZQP 6 List types on next line if 0 < NTYPES < 6 +ZQK 19 List types on next line if 0 < NTYPES < 19 ZTZ 3 List types on next line if 0 < NTYPES < 3 ZLS 6 List types on next line if 0 < NTYPES < 6 ZEQ From 5bf87c86f580982d741d3870740364659f99a4af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 12:10:20 +0100 Subject: [PATCH 420/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- cmake/lapack.cmake | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 22476f561..003a8b3c1 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -52,7 +52,7 @@ set(SLASRC sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f + sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f sgetrf2.f sgetri.f sggbak.f sggbal.f @@ -67,7 +67,7 @@ set(SLASRC slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f slapll.f slapmt.f - slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f @@ -139,7 +139,7 @@ set(CLASRC cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f cgesvdx.f cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f @@ -173,7 +173,7 @@ set(CLASRC clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f - claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqz0.f claqz1.f claqz2.f claqz3.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f @@ -243,7 +243,7 @@ set(DLASRC dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f + dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f dgetrf2.f dgetri.f dggbak.f dggbal.f @@ -258,7 +258,7 @@ set(DLASRC dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f @@ -331,7 +331,7 @@ set(ZLASRC zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f @@ -367,7 +367,7 @@ set(ZLASRC zlanhe.f zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f - zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f @@ -557,7 +557,7 @@ set(SLASRC sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c sgehd2.c sgehrd.c sgelq2.c sgelqf.c sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c - sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c + sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c sgetrf2.c sgetri.c sggbak.c sggbal.c @@ -571,7 +571,7 @@ set(SLASRC slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c slansy.c slantb.c slantp.c slantr.c slanv2.c slapll.c slapmt.c - slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c + slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c @@ -643,7 +643,7 @@ set(CLASRC cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c - cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c + cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c @@ -677,7 +677,7 @@ set(CLASRC clanhb.c clanhe.c clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c - claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c + claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c @@ -746,7 +746,7 @@ set(DLASRC dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c dgehd2.c dgehrd.c dgelq2.c dgelqf.c dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c - dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c + dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c dgetrf2.c dgetri.c dggbak.c dggbal.c @@ -760,7 +760,7 @@ set(DLASRC dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c dlapll.c dlapmt.c - dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c + dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c @@ -833,7 +833,7 @@ set(ZLASRC zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c zgehd2.c zgehrd.c zgelq2.c zgelqf.c - zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c + zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c @@ -868,7 +868,7 @@ set(ZLASRC zlanhe.c zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c - zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c + zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c From f437339130a7e3878a94b2a2ea87edbdd05ba5cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 12:12:26 +0100 Subject: [PATCH 421/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/SRC/cgeqp3rk.c | 1071 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/claqp2rk.c | 943 +++++++++++++++++++++++++++ lapack-netlib/SRC/claqp3rk.c | 1152 +++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgeqp3rk.c | 1059 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/dlaqp2rk.c | 923 +++++++++++++++++++++++++++ lapack-netlib/SRC/dlaqp3rk.c | 1113 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgeqp3rk.c | 1055 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/slaqp2rk.c | 918 +++++++++++++++++++++++++++ lapack-netlib/SRC/slaqp3rk.c | 1109 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgeqp3rk.c | 1074 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/zlaqp2rk.c | 947 ++++++++++++++++++++++++++++ lapack-netlib/SRC/zlaqp3rk.c | 1157 ++++++++++++++++++++++++++++++++++ 12 files changed, 12521 insertions(+) create mode 100644 lapack-netlib/SRC/cgeqp3rk.c create mode 100644 lapack-netlib/SRC/claqp2rk.c create mode 100644 lapack-netlib/SRC/claqp3rk.c create mode 100644 lapack-netlib/SRC/dgeqp3rk.c create mode 100644 lapack-netlib/SRC/dlaqp2rk.c create mode 100644 lapack-netlib/SRC/dlaqp3rk.c create mode 100644 lapack-netlib/SRC/sgeqp3rk.c create mode 100644 lapack-netlib/SRC/slaqp2rk.c create mode 100644 lapack-netlib/SRC/slaqp3rk.c create mode 100644 lapack-netlib/SRC/zgeqp3rk.c create mode 100644 lapack-netlib/SRC/zlaqp2rk.c create mode 100644 lapack-netlib/SRC/zlaqp3rk.c diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c new file mode 100644 index 000000000..54e7fb140 --- /dev/null +++ b/lapack-netlib/SRC/cgeqp3rk.c @@ -0,0 +1,1071 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + q__1.r = (real) lwkopt, q__1.i = 0.f; + work[1].r = q__1.r, work[1].i = q__1.i; + + return 0; + +/* End of CGEQP3RK */ + +} /* cgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c new file mode 100644 index 000000000..4184c5927 --- /dev/null +++ b/lapack-netlib/SRC/claqp2rk.c @@ -0,0 +1,943 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0.f, tau[i__3].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + r__1 = tau[i__2].r; + if (sisnan_(&r__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[kk]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[kk]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + r_cnjg(&q__1, &tau[kk]); + clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1.f - r__1 * r__1; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + + return 0; + +/* End of CLAQP2RK */ + +} /* claqp2rk_ */ + diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c new file mode 100644 index 000000000..ca305fab7 --- /dev/null +++ b/lapack-netlib/SRC/claqp3rk.c @@ -0,0 +1,1152 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0.f, tau[i__2].i = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = 0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + r_cnjg(&q__1, &f[k + j * f_dim1]); + f[i__2].r = q__1.r, f[i__2].i = q__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0.f, tau[i__1].i = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since CLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by CLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by CLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + r__1 = tau[i__1].r; + if (sisnan_(&r__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + r__1 = r_imag(&tau[k]); + if (sisnan_(&r__1)) { + taunan = r_imag(&tau[k]); + } else { + taunan = 0.f; + } + } + + if (sisnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1.f, a[i__1].i = 0.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0.f, f[i__2].i = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i; + cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SCNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of CLAQP3RK */ + +} /* claqp3rk_ */ + diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c new file mode 100644 index 000000000..17a78dd5a --- /dev/null +++ b/lapack-netlib/SRC/dgeqp3rk.c @@ -0,0 +1,1059 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + work[1] = (doublereal) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (doublereal) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (doublereal) lwkopt; + + return 0; + +/* End of DGEQP3RK */ + +} /* dgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c new file mode 100644 index 000000000..de216ad97 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp2rk.c @@ -0,0 +1,923 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since DLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(KK) for NaN. */ + + if (disnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; + temp = 1. - d__2 * d__2; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.; + } + + return 0; + +/* End of DLAQP2RK */ + +} /* dlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c new file mode 100644 index 000000000..e8c61c257 --- /dev/null +++ b/lapack-netlib/SRC/dlaqp3rk.c @@ -0,0 +1,1113 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since DLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by DLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by DLARFG is covered by checking TAU(K) for NaN. */ + + if (disnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + d__1 = -tau[k]; + dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of DLAQP3RK */ + +} /* dlaqp3rk_ */ + diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c new file mode 100644 index 000000000..fe52901bf --- /dev/null +++ b/lapack-netlib/SRC/sgeqp3rk.c @@ -0,0 +1,1055 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ + + eps = slamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.f) { + safmin = slamch_("Safe minimum"); +/* Computing MAX */ + r__1 = *abstol, r__2 = safmin * 2.f; + *abstol = f2cmax(r__1,r__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.f) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1.f <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.f; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + work[1] = (real) lwkopt; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], & + work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb + + 1], &i__1, &iwork[1], &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + work[1] = (real) lwkopt; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], & + work[(*n << 1) + 1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1); + *maxc2nrmk = work[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + work[1] = (real) lwkopt; + + return 0; + +/* End of SGEQP3RK */ + +} /* sgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c new file mode 100644 index 000000000..0bfa71ab9 --- /dev/null +++ b/lapack-netlib/SRC/slaqp2rk.c @@ -0,0 +1,918 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = ZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + tau[kk] = 0.f; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since SLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(KK) for NaN. */ + + if (sisnan_(&tau[kk])) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[kk]; + *relmaxc2nrmk = tau[kk]; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + aikk = a[i__ + kk * a_dim1]; + a[i__ + kk * a_dim1] = 1.f; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[ + kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + a[i__ + kk * a_dim1] = aikk; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; + temp = 1.f - r__2 * r__2; + temp = f2cmax(temp,0.f); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.f; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.f; + *relmaxc2nrmk = 0.f; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + tau[j] = 0.f; + } + + return 0; + +/* End of SLAQP2RK */ + +} /* slaqp2rk_ */ + diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c new file mode 100644 index 000000000..e3632538b --- /dev/null +++ b/lapack-netlib/SRC/slaqp3rk.c @@ -0,0 +1,1109 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.f) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.f; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, & + c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + + f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + tau[j] = 0.f; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1); + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + tau[k] = 0.f; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since SLARFG cannot produce TAU(K) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by SLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by SLARFG is covered by checking TAU(K) for NaN. */ + + if (sisnan_(&tau[k])) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = tau[k]; + *relmaxc2nrmk = tau[k]; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, & + a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, & + c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + aik = a[i__ + k * a_dim1]; + a[i__ + k * a_dim1] = 1.f; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + f[j + k * f_dim1] = 0.f; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + r__1 = -tau[k]; + sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, & + a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, + &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], + lda); + } + + a[i__ + k * a_dim1] = aik; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.f) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j]; +/* Computing MAX */ + r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp); + temp = f2cmax(r__1,r__2); +/* Computing 2nd power */ + r__1 = vn1[j] / vn2[j]; + temp2 = temp * (r__1 * r__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ + + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ + + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* SNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(SLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of SLAQP3RK */ + +} /* slaqp3rk_ */ + diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c new file mode 100644 index 000000000..0c8b41c2d --- /dev/null +++ b/lapack-netlib/SRC/zgeqp3rk.c @@ -0,0 +1,1074 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + +/* Check if the matrix A contains +Inf or -Inf, set INFO parameter */ +/* to the column number, where the first +/-Inf is found plus N, */ +/* and continue the computation. */ + + *info = *n + kp1; + + } + +/* ================================================================== */ + +/* Quick return if possible for the case when the first */ +/* stopping criterion is satisfied, i.e. KMAX = 0. */ + + if (*kmax == 0) { + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ + + eps = dlamch_("Epsilon"); + +/* Adjust ABSTOL */ + + if (*abstol >= 0.) { + safmin = dlamch_("Safe minimum"); +/* Computing MAX */ + d__1 = *abstol, d__2 = safmin * 2.; + *abstol = f2cmax(d__1,d__2); + } + +/* Adjust RELTOL */ + + if (*reltol >= 0.) { + *reltol = f2cmax(*reltol,eps); + } + +/* =================================================================== */ + +/* JMAX is the maximum index of the column to be factorized, */ +/* which is also limited by the first stopping criterion KMAX. */ + + jmax = f2cmin(*kmax,minmn); + +/* =================================================================== */ + +/* Quick return if possible for the case when the second or third */ +/* stopping criterion for the whole original matrix is satified, */ +/* i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */ +/* (which is ONE <= RELTOL). */ + + if (maxc2nrm <= *abstol || 1. <= *reltol) { + + *k = 0; + *maxc2nrmk = maxc2nrm; + *relmaxc2nrmk = 1.; + + i__1 = minmn; + for (j = 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + return 0; + } + +/* ================================================================== */ +/* Factorize columns */ +/* ================================================================== */ + +/* Determine the block size. */ + + nbmin = 2; + nx = 0; + + if (nb > 1 && nb < minmn) { + +/* Determine when to cross over from blocked to unblocked code. */ +/* (for N less than NX, unblocked code should be used). */ + +/* Computing MAX */ + i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, ( + ftnlen)8, (ftnlen)1); + nx = f2cmax(i__1,i__2); + + if (nx < minmn) { + +/* Determine if workspace is large enough for blocked code. */ + + if (*lwork < lwkopt) { + +/* Not enough workspace to use optimal block size that */ +/* is currently stored in NB. */ +/* Reduce NB and determine the minimum value of NB. */ + + nb = (*lwork - (*n << 1)) / (*n + 1); +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1, + &c_n1, (ftnlen)8, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + } + } + } + +/* ================================================================== */ + +/* DONE is the boolean flag to rerpresent the case when the */ +/* factorization completed in the block factorization routine, */ +/* before the end of the block. */ + + done = FALSE_; + +/* J is the column index. */ + + j = 1; + +/* (1) Use blocked code initially. */ + +/* JMAXB is the maximum column index of the block, when the */ +/* blocked code is used, is also limited by the first stopping */ +/* criterion KMAX. */ + +/* Computing MIN */ + i__1 = *kmax, i__2 = minmn - nx; + jmaxb = f2cmin(i__1,i__2); + + if (nb >= nbmin && nb < jmax && jmaxb > 0) { + +/* Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */ +/* J is the column index of a column block; */ +/* JB is the column block size to pass to block factorization */ +/* routine in a loop step; */ +/* JBF is the number of columns that were actually factorized */ +/* that was returned by the block factorization routine */ +/* in a loop step, JBF <= JB; */ +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + while(j <= jmaxb) { + +/* Computing MIN */ + i__1 = nb, i__2 = jmaxb - j + 1; + jb = f2cmin(i__1,i__2); + n_sub__ = *n - j + 1; + ioffset = j - 1; + +/* Factorize JB columns among the columns A(J:N). */ + + i__1 = *n + *nrhs - j + 1; + zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, + &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, + maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], & + rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], + &iinfo); + +/* Set INFO on the first occurence of Inf. */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } + + if (done) { + +/* Either the submatrix is zero before the end of the */ +/* column block, or ABSTOL or RELTOL criterion is */ +/* satisfied before the end of the column block, we can */ +/* return from the routine. Perform the following before */ +/* returning: */ +/* a) Set the number of factorized columns K, */ +/* K = IOFFSET + JBF from the last call of blocked */ +/* routine. */ +/* NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */ +/* by the block factorization routine; */ +/* 2) The remaining TAUs are set to ZERO by the */ +/* block factorization routine. */ + + *k = ioffset + jbf; + +/* Set INFO on the first occurrence of NaN, NaN takes */ +/* prcedence over Inf. */ + + if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + +/* Return from the routine. */ + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + + return 0; + + } + + j += jbf; + + } + + } + +/* Use unblocked code to factor the last or only block. */ +/* J = JMAX+1 means we factorized the maximum possible number of */ +/* columns, that is in ELSE clause we need to compute */ +/* the MAXC2NORM and RELMAXC2NORM to return after we processed */ +/* the blocks. */ + + if (j <= jmax) { + +/* N_SUB is the number of columns in the submatrix; */ +/* IOFFSET is the number of rows that should not be factorized. */ + + n_sub__ = *n - j + 1; + ioffset = j - 1; + + i__1 = jmax - j + 1; + zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, & + maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, + relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], & + work[1], &iinfo); + +/* ABSTOL or RELTOL criterion is satisfied when the number of */ +/* the factorized columns KF is smaller then the number */ +/* of columns JMAX-J+1 supplied to be factorized by the */ +/* unblocked routine, we can return from */ +/* the routine. Perform the following before returning: */ +/* a) Set the number of factorized columns K, */ +/* b) MAXC2NRMK and RELMAXC2NRMK are returned by the */ +/* unblocked factorization routine above. */ + + *k = j - 1 + kf; + +/* Set INFO on the first exception occurence. */ + +/* Set INFO on the first exception occurence of Inf or NaN, */ +/* (NaN takes precedence over Inf). */ + + if (iinfo > n_sub__ && *info == 0) { + *info = (ioffset << 1) + iinfo; + } else if (iinfo <= n_sub__ && iinfo > 0) { + *info = ioffset + iinfo; + } + + } else { + +/* Compute the return values for blocked code. */ + +/* Set the number of factorized columns if the unblocked routine */ +/* was not called. */ + + *k = jmax; + +/* If there exits a residual matrix after the blocked code: */ +/* 1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */ +/* residual matrix, otherwise set them to ZERO; */ +/* 2) Set TAU(K+1:MINMN) to ZERO. */ + + if (*k < minmn) { + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1); + *maxc2nrmk = rwork[jmaxc2nrm]; + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / maxc2nrm; + } + + i__1 = minmn; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + + } + +/* END IF( J.LE.JMAX ) THEN */ + + } + + z__1.r = (doublereal) lwkopt, z__1.i = 0.; + work[1].r = z__1.r, work[1].i = z__1.i; + + return 0; + +/* End of ZGEQP3RK */ + +} /* zgeqp3rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c new file mode 100644 index 000000000..0d38e71fb --- /dev/null +++ b/lapack-netlib/SRC/zlaqp2rk.c @@ -0,0 +1,947 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i myhugeval) { + *info = *n + kk - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL >= ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL >= ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + +/* Set K, the number of factorized columns. */ + + *k = kk - 1; + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */ + + i__2 = minmnfact; + for (j = kk; j <= i__2; ++j) { + i__3 = j; + tau[i__3].r = 0., tau[i__3].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,KK:N): */ +/* 1) swap the KK-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) copy the KK-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than KK in the next loop step.) */ +/* 3) Save the pivot interchange with the indices relative to the */ +/* the original matrix A, not the block A(1:M,1:N). */ + + if (kp != kk) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1); + vn1[kp] = vn1[kk]; + vn2[kp] = vn2[kk]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[kk]; + jpiv[kk] = itemp; + } + +/* Generate elementary reflector H(KK) using the column A(I:M,KK), */ +/* if the column has more than one element, otherwise */ +/* the elementary reflector would be an identity matrix, */ +/* and TAU(KK) = CZERO. */ + + if (i__ < *m) { + i__2 = *m - i__ + 1; + zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], & + c__1, &tau[kk]); + } else { + i__2 = kk; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Check if TAU(KK) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(KK) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(KK) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(KK) for NaN. */ + + i__2 = kk; + d__1 = tau[i__2].r; + if (disnan_(&d__1)) { + i__2 = kk; + taunan = tau[i__2].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[kk]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[kk]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + *k = kk - 1; + *info = kk; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* Array TAU(KK:MINMNFACT) is not set and contains */ +/* undefined elements, except the first element TAU(KK) = NaN. */ + + return 0; + } + +/* Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */ +/* ( If M >= N, then at KK = N there is no residual matrix, */ +/* i.e. no columns of A to update, only columns of B. */ +/* If M < N, then at KK = M-IOFFSET, I = M and we have a */ +/* one-row residual matrix in A and the elementary */ +/* reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */ +/* is needed for the residual matrix in A and the */ +/* right-hand-side-matrix in B. */ +/* Therefore, we update only if */ +/* KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */ +/* condition is satisfied, not only KK < N+NRHS ) */ + + if (kk < minmnupdt) { + i__2 = i__ + kk * a_dim1; + aikk.r = a[i__2].r, aikk.i = a[i__2].i; + i__2 = i__ + kk * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + i__2 = *m - i__ + 1; + i__3 = *n + *nrhs - kk; + d_cnjg(&z__1, &tau[kk]); + zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, + &a[i__ + (kk + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + kk * a_dim1; + a[i__2].r = aikk.r, a[i__2].i = aikk.i; + } + + if (kk < minmnfact) { + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */ +/* when KK < f2cmin(M-IOFFSET, N). */ + + i__2 = *n; + for (j = kk + 1; j <= i__2; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + +/* Computing 2nd power */ + d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; + temp = 1. - d__1 * d__1; + temp = f2cmax(temp,0.); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* Compute the column 2-norm for the partial */ +/* column A(I+1:M,J) by explicitly computing it, */ +/* and store it in both partial 2-norm vector VN1 */ +/* and exact column 2-norm vector VN2. */ + + i__3 = *m - i__; + vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], & + c__1); + vn2[j] = vn1[j]; + + } else { + +/* Update the column 2-norm for the partial */ +/* column A(I+1:M,J) by removing one */ +/* element A(I,J) and store it in partial */ +/* 2-norm vector VN1. */ + + vn1[j] *= sqrt(temp); + + } + } + } + + } + +/* End factorization loop */ + + } + +/* If we reached this point, all colunms have been factorized, */ +/* i.e. no condition was triggered to exit the routine. */ +/* Set the number of factorized columns. */ + + *k = *kmax; + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */ +/* we return. */ + + if (*k < minmnfact) { + + i__1 = *n - *k; + jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1); + *maxc2nrmk = vn1[jmaxc2nrm]; + + if (*k == 0) { + *relmaxc2nrmk = 1.; + } else { + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + } + + } else { + *maxc2nrmk = 0.; + *relmaxc2nrmk = 0.; + } + +/* We reached the end of the loop, i.e. all KMAX columns were */ +/* factorized, set TAUs corresponding to the columns that were */ +/* not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */ + + i__1 = minmnfact; + for (j = *k + 1; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + + return 0; + +/* End of ZLAQP2RK */ + +} /* zlaqp2rk_ */ + diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c new file mode 100644 index 000000000..cb44e4d34 --- /dev/null +++ b/lapack-netlib/SRC/zlaqp3rk.c @@ -0,0 +1,1157 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* Quick return, if the submatrix A(I:M,K:N) is */ +/* a zero matrix. We need to check it only if the column index */ +/* (same as row index) is larger than 1, since the condition */ +/* for the whole original matrix A_orig is checked in the main */ +/* routine. */ + + if (*maxc2nrmk == 0.) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *relmaxc2nrmk = 0.; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix is zero and we stop the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, + kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * + a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* Check if the submatrix A(I:M,K:N) contains Inf, */ +/* set INFO parameter to the column number, where */ +/* the first Inf is found plus N, and continue */ +/* the computation. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + if (*info == 0 && *maxc2nrmk > myhugeval) { + *info = *n + k - 1 + kp; + } + +/* ============================================================ */ + +/* Test for the second and third tolerance stopping criteria. */ +/* NOTE: There is no need to test for ABSTOL.GE.ZERO, since */ +/* MAXC2NRMK is non-negative. Similarly, there is no need */ +/* to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */ +/* non-negative. */ +/* We need to check the condition only if the */ +/* column index (same as row index) of the original whole */ +/* matrix is larger than 1, since the condition for whole */ +/* original matrix is checked in the main routine. */ + + *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm; + + if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig; */ + + *kb = k - 1; + if__ = i__ - 1; + +/* Apply the block reflector to the residual of the */ +/* matrix A and the residual of the right hand sides B, if */ +/* the residual matrix and and/or the residual of the right */ +/* hand sides exist, i.e. if the submatrix */ +/* A(I+1:M,KB+1:N+NRHS) exists. This occurs when */ +/* KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, & + i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[* + kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb + + 1) * a_dim1], lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Set TAUs corresponding to the columns that were not */ +/* factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */ +/* which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */ + + i__1 = minmnfact; + for (j = k; j <= i__1; ++j) { + i__2 = j; + tau[i__2].r = 0., tau[i__2].i = 0.; + } + +/* Return from the routine. */ + + return 0; + + } + +/* ============================================================ */ + +/* End ELSE of IF(I.EQ.1) */ + + } + +/* =============================================================== */ + +/* If the pivot column is not the first column of the */ +/* subblock A(1:M,K:N): */ +/* 1) swap the K-th column and the KP-th pivot column */ +/* in A(1:M,1:N); */ +/* 2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */ +/* 3) copy the K-th element into the KP-th element of the partial */ +/* and exact 2-norm vectors VN1 and VN2. (Swap is not needed */ +/* for VN1 and VN2 since we use the element with the index */ +/* larger than K in the next loop step.) */ +/* 4) Save the pivot interchange with the indices relative to the */ +/* the original matrix A_orig, not the block A(1:M,1:N). */ + + if (kp != k) { + zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf); + vn1[kp] = vn1[k]; + vn2[kp] = vn2[k]; + itemp = jpiv[kp]; + jpiv[kp] = jpiv[k]; + jpiv[k] = itemp; + } + +/* Apply previous Householder reflectors to column K: */ +/* A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */ + + if (k > 1) { + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + i__1 = *m - i__ + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = 0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda, + &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1); + i__1 = k - 1; + for (j = 1; j <= i__1; ++j) { + i__2 = k + j * f_dim1; + d_cnjg(&z__1, &f[k + j * f_dim1]); + f[i__2].r = z__1.r, f[i__2].i = z__1.i; + } + } + +/* Generate elementary reflector H(k) using the column A(I:M,K). */ + + if (i__ < *m) { + i__1 = *m - i__ + 1; + zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], & + c__1, &tau[k]); + } else { + i__1 = k; + tau[i__1].r = 0., tau[i__1].i = 0.; + } + +/* Check if TAU(K) contains NaN, set INFO parameter */ +/* to the column number where NaN is found and return from */ +/* the routine. */ +/* NOTE: There is no need to check TAU(K) for Inf, */ +/* since ZLARFG cannot produce TAU(KK) or Householder vector */ +/* below the diagonal containing Inf. Only BETA on the diagonal, */ +/* returned by ZLARFG can contain Inf, which requires */ +/* TAU(K) to contain NaN. Therefore, this case of generating Inf */ +/* by ZLARFG is covered by checking TAU(K) for NaN. */ + + i__1 = k; + d__1 = tau[i__1].r; + if (disnan_(&d__1)) { + i__1 = k; + taunan = tau[i__1].r; + } else /* if(complicated condition) */ { + d__1 = d_imag(&tau[k]); + if (disnan_(&d__1)) { + taunan = d_imag(&tau[k]); + } else { + taunan = 0.; + } + } + + if (disnan_(&taunan)) { + + *done = TRUE_; + +/* Set KB, the number of factorized partial columns */ +/* that are non-zero in each step in the block, */ +/* i.e. the rank of the factor R. */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig. */ + + *kb = k - 1; + if__ = i__ - 1; + *info = k; + +/* Set MAXC2NRMK and RELMAXC2NRMK to NaN. */ + + *maxc2nrmk = taunan; + *relmaxc2nrmk = taunan; + +/* There is no need to apply the block reflector to the */ +/* residual of the matrix A stored in A(KB+1:M,KB+1:N), */ +/* since the submatrix contains NaN and we stop */ +/* the computation. */ +/* But, we need to apply the block reflector to the residual */ +/* right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */ +/* residual right hand sides exist. This occurs */ +/* when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */ + +/* A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */ +/* A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */ + + if (*nrhs > 0 && *kb < *m - *ioffset) { + i__1 = *m - if__; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb, + &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1], + lda); + } + +/* There is no need to recompute the 2-norm of the */ +/* difficult columns, since we stop the factorization. */ + +/* Array TAU(KF+1:MINMNFACT) is not set and contains */ +/* undefined elements. */ + +/* Return from the routine. */ + + return 0; + } + +/* =============================================================== */ + + i__1 = i__ + k * a_dim1; + aik.r = a[i__1].r, aik.i = a[i__1].i; + i__1 = i__ + k * a_dim1; + a[i__1].r = 1., a[i__1].i = 0.; + +/* =============================================================== */ + +/* Compute the current K-th column of F: */ +/* 1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */ + + if (k < *n + *nrhs) { + i__1 = *m - i__ + 1; + i__2 = *n + *nrhs - k; + zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, & + f[k + 1 + k * f_dim1], &c__1); + } + +/* 2) Zero out elements above and on the diagonal of the */ +/* column K in matrix F, i.e elements F(1:K,K). */ + + i__1 = k; + for (j = 1; j <= i__1; ++j) { + i__2 = j + k * f_dim1; + f[i__2].r = 0., f[i__2].i = 0.; + } + +/* 3) Incremental updating of the K-th column of F: */ +/* F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */ +/* * A(I:M,K). */ + + if (k > 1) { + i__1 = *m - i__ + 1; + i__2 = k - 1; + i__3 = k; + z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i; + zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + + a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1] + , &c__1); + + i__1 = *n + *nrhs; + i__2 = k - 1; + zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, & + auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1); + } + +/* =============================================================== */ + +/* Update the current I-th row of A: */ +/* A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */ +/* - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */ + + if (k < *n + *nrhs) { + i__1 = *n + *nrhs - k; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, & + z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, & + c_b2, &a[i__ + (k + 1) * a_dim1], lda); + } + + i__1 = i__ + k * a_dim1; + a[i__1].r = aik.r, a[i__1].i = aik.i; + +/* Update the partial column 2-norms for the residual matrix, */ +/* only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */ +/* when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */ + + if (k < minmnfact) { + + i__1 = *n; + for (j = k + 1; j <= i__1; ++j) { + if (vn1[j] != 0.) { + +/* NOTE: The following lines follow from the analysis in */ +/* Lapack Working Note 176. */ + + temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j]; +/* Computing MAX */ + d__1 = 0., d__2 = (temp + 1.) * (1. - temp); + temp = f2cmax(d__1,d__2); +/* Computing 2nd power */ + d__1 = vn1[j] / vn2[j]; + temp2 = temp * (d__1 * d__1); + if (temp2 <= tol3z) { + +/* At J-index, we have a difficult column for the */ +/* update of the 2-norm. Save the index of the previous */ +/* difficult column in IWORK(J-1). */ +/* NOTE: ILSTCC > 1, threfore we can use IWORK only */ +/* with N-1 elements, where the elements are */ +/* shifted by 1 to the left. */ + + iwork[j - 1] = lsticc; + +/* Set the index of the last difficult column LSTICC. */ + + lsticc = j; + + } else { + vn1[j] *= sqrt(temp); + } + } + } + + } + +/* End of while loop. */ + + } + +/* Now, afler the loop: */ +/* Set KB, the number of factorized columns in the block; */ +/* Set IF, the number of processed rows in the block, which */ +/* is the same as the number of processed rows in */ +/* the original whole matrix A_orig, IF = IOFFSET + KB. */ + + *kb = k; + if__ = i__; + +/* Apply the block reflector to the residual of the matrix A */ +/* and the residual of the right hand sides B, if the residual */ +/* matrix and and/or the residual of the right hand sides */ +/* exist, i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */ +/* This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */ + +/* A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */ +/* A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */ + + if (*kb < minmnupdt) { + i__1 = *m - if__; + i__2 = *n + *nrhs - *kb; + z__1.r = -1., z__1.i = 0.; + zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1, + &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2, + &a[if__ + 1 + (*kb + 1) * a_dim1], lda); + } + +/* Recompute the 2-norm of the difficult columns. */ +/* Loop over the index of the difficult columns from the largest */ +/* to the smallest index. */ + + while(lsticc > 0) { + +/* LSTICC is the index of the last difficult column is greater */ +/* than 1. */ +/* ITEMP is the index of the previous difficult column. */ + + itemp = iwork[lsticc - 1]; + +/* Compute the 2-norm explicilty for the last difficult column and */ +/* save it in the partial and exact 2-norm vectors VN1 and VN2. */ + +/* NOTE: The computation of VN1( LSTICC ) relies on the fact that */ +/* DZNRM2 does not fail on vectors with norm below the value of */ +/* SQRT(DLAMCH('S')) */ + + i__1 = *m - if__; + vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1); + vn2[lsticc] = vn1[lsticc]; + +/* Downdate the index of the last difficult column to */ +/* the index of the previous difficult column. */ + + lsticc = itemp; + + } + + return 0; + +/* End of ZLAQP3RK */ + +} /* zlaqp3rk_ */ + From 20a2a83f498ada982c431bd34da39181d2785ba1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 12:18:15 +0100 Subject: [PATCH 422/718] Implement truncated QR with pivoting (Reference-LAPACK PR 891) --- lapack-netlib/SRC/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 448fbd8df..de2242701 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -136,7 +136,7 @@ SLASRC_O = \ sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ sgels.o sgelsd.o sgelss.o sgelsy.o sgeql2.o sgeqlf.o \ - sgeqp3.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ + sgeqp3.o sgeqp3rk.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvdx.o sgesvx.o \ sgetc2.o sgetf2.o sgetri.o \ sggbak.o sggbal.o sgges.o sgges3.o sggesx.o \ @@ -151,7 +151,7 @@ SLASRC_O = \ slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ slansy.o slantb.o slantp.o slantr.o slanv2.o \ slapll.o slapmt.o \ - slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ + slaqgb.o slaqge.o slaqp2.o slaqp2rk.o slaqp3rk.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ @@ -232,7 +232,7 @@ CLASRC_O = \ cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ - cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ + cgels.o cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o cgeqp3rk.o \ cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o cgesvdx.o \ cgesvj.o cgejsv.o cgsvj0.o cgsvj1.o \ @@ -266,7 +266,7 @@ CLASRC_O = \ clanhb.o clanhe.o \ clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ - claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ + claqhb.o claqhe.o claqhp.o claqp2.o claqp2rk.o claqp3rk.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ claqz0.o claqz1.o claqz2.o claqz3.o \ @@ -345,7 +345,7 @@ DLASRC_O = \ dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ dgels.o dgelsd.o dgelss.o dgelsy.o dgeql2.o dgeqlf.o \ - dgeqp3.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ + dgeqp3.o dgeqp3rk.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvdx.o dgesvx.o \ dgetc2.o dgetf2.o dgetrf.o dgetri.o \ dgetrs.o dggbak.o dggbal.o dgges.o dgges3.o dggesx.o \ @@ -360,7 +360,7 @@ DLASRC_O = \ dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ dlapll.o dlapmt.o \ - dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ + dlaqgb.o dlaqge.o dlaqp2.o dlaqp2rk.o dlaqp3rk.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ @@ -437,7 +437,7 @@ ZLASRC_O = \ zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ - zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ + zgels.o zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o zgeqp3rk.o \ zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvdx.o \ zgesvj.o zgejsv.o zgsvj0.o zgsvj1.o \ @@ -473,7 +473,7 @@ ZLASRC_O = \ zlanhe.o \ zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ - zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ + zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqp2rk.o zlaqp3rk.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ zlaqz0.o zlaqz1.o zlaqz2.o zlaqz3.o \ From 8b2a9568905b6480b457fb8873a7dff7b116a9fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 14:20:12 +0100 Subject: [PATCH 423/718] Implement truncated QR with pivot (Reference-LAPACK PR 891) --- lapack-netlib/SRC/ilaenv.c | 83 +++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c index c47224a0c..8f3b2db8e 100644 --- a/lapack-netlib/SRC/ilaenv.c +++ b/lapack-netlib/SRC/ilaenv.c @@ -191,7 +191,7 @@ typedef struct Namelist Namelist; #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } #ifdef _MSC_VER #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} -#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} #else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} @@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; -#define mycycle() continue; -#define myceiling(w) {ceil(w)} -#define myhuge(w) {HUGE_VAL} +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} -#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* procedure parameter types for -A and -C++ */ @@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + /* Table of constant values */ static integer c__1 = 1; -static real c_b174 = 0.f; -static real c_b175 = 1.f; +static real c_b179 = 0.f; +static real c_b180 = 1.f; static integer c__0 = 0; /* > \brief \b ILAENV */ @@ -599,9 +605,9 @@ f"> */ /* > = 9: maximum size of the subproblems at the bottom of the */ /* > computation tree in the divide-and-conquer algorithm */ /* > (used by xGELSD and xGESDD) */ -/* > =10: ieee NaN arithmetic can be trusted not to trap */ +/* > =10: ieee infinity and NaN arithmetic can be trusted not to trap */ /* > =11: infinity arithmetic can be trusted not to trap */ -/* > 12 <= ISPEC <= 16: */ +/* > 12 <= ISPEC <= 17: */ /* > xHSEQR or related subroutines, */ /* > see IPARMQ for detailed explanation */ /* > \endverbatim */ @@ -652,9 +658,7 @@ f"> */ /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ - -/* > \ingroup OTHERauxiliary */ +/* > \ingroup ilaenv */ /* > \par Further Details: */ /* ===================== */ @@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, opts_len) { /* System generated locals */ - integer ret_val; + integer ret_val, i__1, i__2, i__3; /* Local variables */ logical twostage; @@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, integer *, integer *); -/* -- LAPACK auxiliary routine (version 3.9.0) -- */ +/* -- LAPACK auxiliary routine -- */ /* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ /* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ -/* November 2019 */ /* ===================================================================== */ @@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1, case 14: goto L160; case 15: goto L160; case 16: goto L160; + case 17: goto L160; } /* Invalid value for ISPEC */ @@ -908,6 +912,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1034,6 +1044,21 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) { +/* The upper bound is to prevent overly aggressive scaling. */ + if (sname) { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,240); + } else { +/* Computing MIN */ +/* Computing MAX */ + i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100; + i__1 = f2cmax(i__2,i__3); + nb = f2cmin(i__1,80); + } } } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) { @@ -1042,6 +1067,12 @@ L50: } else { nb = 64; } + } else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) { + if (sname) { + nb = 32; + } else { + nb = 32; + } } } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) { @@ -1093,6 +1124,12 @@ L60: } else { nbmin = 2; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nbmin = 2; + } else { + nbmin = 2; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) { @@ -1184,6 +1221,12 @@ L70: } else { nx = 128; } + } else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) { + if (sname) { + nx = 128; + } else { + nx = 128; + } } } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) { if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) { @@ -1270,29 +1313,29 @@ L130: L140: -/* ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */ +/* ISPEC = 10: ieee and infinity NaN arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__1, &c_b174, &c_b175); + ret_val = ieeeck_(&c__1, &c_b179, &c_b180); } return ret_val; L150: -/* ISPEC = 11: infinity arithmetic can be trusted not to trap */ +/* ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */ /* ILAENV = 0 */ ret_val = 1; if (ret_val == 1) { - ret_val = ieeeck_(&c__0, &c_b174, &c_b175); + ret_val = ieeeck_(&c__0, &c_b179, &c_b180); } return ret_val; L160: -/* 12 <= ISPEC <= 16: xHSEQR or related subroutines. */ +/* 12 <= ISPEC <= 17: xHSEQR or related subroutines. */ ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4) ; From ee47e4e494629b29d39557d4415349566060a8e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 15:21:32 +0100 Subject: [PATCH 424/718] run m1/llvm/cmake buid on all 4 cores --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 6c2baf8a0..b4c4870d0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -29,7 +29,7 @@ task: - mkdir build - cd build - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. - - make + - make -j 4 task: name: AppleM1/GCC/MAKE/OPENMP From cc622f2406dc7f2ca47af27db5eb3bd57ab46ae0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Nov 2023 22:51:09 +0100 Subject: [PATCH 425/718] restore OpenBLAS-specific target_link_libraries --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index e28818c76..143fd0597 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) From f8ad5344c210960fc399ca5b0ad8559ab5ca253e Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Fri, 17 Nov 2023 23:49:34 +0000 Subject: [PATCH 426/718] Fix casum fallback kernel. This kernel is only used on Skylake+ if the kernel with AVX512 intrinsics can't be used, but used the variable x1 incorrectly in the tail end of the loop, as it is still at the initial value instead of where x points to. This caused 55 "other error"s in the LAPACK tests (https://github.com/OpenMathLib/OpenBLAS/issues/4282) This change makes casum.c as similar as possible as zasum.c, because zasum.c does this correctly. --- kernel/x86_64/casum.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index e4d054311..28d78d279 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -9,12 +9,12 @@ #endif #ifndef HAVE_CASUM_KERNEL -static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) { BLASLONG i=0; BLASLONG n_8 = n & -8; - FLOAT *x = x1; + FLOAT *x1 = x; FLOAT temp0, temp1, temp2, temp3; FLOAT temp4, temp5, temp6, temp7; FLOAT sum0 = 0.0; @@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) FLOAT sum4 = 0.0; while (i < n_8) { - temp0 = ABS_K(x[0]); - temp1 = ABS_K(x[1]); - temp2 = ABS_K(x[2]); - temp3 = ABS_K(x[3]); - temp4 = ABS_K(x[4]); - temp5 = ABS_K(x[5]); - temp6 = ABS_K(x[6]); - temp7 = ABS_K(x[7]); + temp0 = ABS_K(x1[0]); + temp1 = ABS_K(x1[1]); + temp2 = ABS_K(x1[2]); + temp3 = ABS_K(x1[3]); + temp4 = ABS_K(x1[4]); + temp5 = ABS_K(x1[5]); + temp6 = ABS_K(x1[6]); + temp7 = ABS_K(x1[7]); sum0 += temp0; sum1 += temp1; @@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) sum2 += temp6; sum3 += temp7; - x+=8; + x1+=8; i+=4; } while (i < n) { - sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); + sum4 += ABS_K(x1[0]) + ABS_K(x1[1]); x1 += 2; i++; } From 47b03fd4b4ce7bc51d5b56397e52e6da3c5f3f36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Nov 2023 23:45:02 +0100 Subject: [PATCH 427/718] Copy XCode15-specific workaround to Fortran flags to fix build of tests --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 1b84195e4..ff06e503c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers endif ifeq (x$(XCVER), x 15) CCOMMON_OPT += -Wl,-ld_classic +FCOMMON_OPT += -Wl,-ld_classic endif endif From 22aa401656032c70c4c0efb8d24bf1a63577c992 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Nov 2023 00:04:31 +0100 Subject: [PATCH 428/718] Temporarily disable the AVX512 CASUM/ZASUM microkernels for any version of NVIDIA HPC (#4327) * Temporarily disable the C/ZASUM microkernels for any version of NVHPC --- kernel/x86_64/casum_microk_skylakex-2.c | 3 ++- kernel/x86_64/zasum_microk_skylakex-2.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d261962de..5d37fd541 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -4,7 +4,8 @@ #endif #if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) -#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) )) +//&& NVCOMPVERS < 2309)) #define HAVE_CASUM_KERNEL 1 diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index dddf03fe2..7260922e7 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -4,7 +4,8 @@ #endif #if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) -#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) )) +//&& NVCOMPVERS < 2309)) #define HAVE_ZASUM_KERNEL 1 From 6b2651ece32365201793b02e78e198abd1f137ac Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 19 Nov 2023 02:57:13 -0600 Subject: [PATCH 429/718] Fix building test_sbgemm --- test/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 715842b4d..56acf1c5b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -260,6 +260,7 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) +CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) @@ -331,7 +332,7 @@ endif ifeq ($(BUILD_BFLOAT16),1) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + $(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) From c34e2cf380a757621f166e6ce8ea8755155158f2 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Sun, 19 Nov 2023 21:21:23 +0000 Subject: [PATCH 430/718] Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum for skylake kernels. This is the same method as used in [sd]asum. _mm_set1_epi64x was commented out for zasum, but has the advantage of avoiding possible undefined behaviour (using an uninitialized variable), optimized out by NVHPC and icx. The new code works fine with those compilers. For GCC 12.3 the generated code is identical; no matter what method you use, the compiler optimizes the code into a compile-time constant, there is no performance benefit using mm_cmpeq_epi8 since the corresponding instruction (VPCMPEQB) isn't actually generated! --- kernel/x86_64/casum_microk_skylakex-2.c | 10 ++++------ kernel/x86_64/zasum_microk_skylakex-2.c | 11 ++++------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index 5d37fd541..10b70ff20 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -2,10 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) -#if (!(defined(__NVCOMPILER) )) -//&& NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_CASUM_KERNEL 1 @@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1 = abs_mask1; + __m128 abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); accum_12 = _mm_setzero_ps(); accum_13 = _mm_setzero_ps(); - abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index 7260922e7..f6bc8e37b 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -2,10 +2,9 @@ #ifdef __NVCOMPILER #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) #endif -#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) -#if (!(defined(__NVCOMPILER) )) -//&& NVCOMPVERS < 2309)) +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) #define HAVE_ZASUM_KERNEL 1 @@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1 = abs_mask1; + __m128d abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); accum_12 = _mm_setzero_pd(); accum_13 = _mm_setzero_pd(); - // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); - abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); - abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); _mm_prefetch(&x1[0], _MM_HINT_T0); if (n2 >= 16){ From 54be8f4d67185c8bf4b7f0737504130e0fa53e83 Mon Sep 17 00:00:00 2001 From: Yuning Zhang Date: Mon, 20 Nov 2023 13:28:25 -0800 Subject: [PATCH 431/718] Update the list of default dynamic targets for x86_64 in the README to be consistent with the Makefile Signed-off-by: Yuning Zhang --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c4e38f18..aaadd0d9c 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. -For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default. +For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default. `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. From 47da601a2d456d1b85b8261e6b1d21a1c2c7e315 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 20 Nov 2023 17:24:22 -0600 Subject: [PATCH 432/718] POWER: Fixing Makefile error Recent commit d99aad8ee308600832da39105a6511275cfe32ad added extra `)`. This patch fixes the warning from Makefile. --- Makefile.power | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index aa1ca080a..3fa6d6faf 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,7 +11,7 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) -ifeq ($(C_COMPILER), GCC)) +ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math else ifneq ($(GCCVERSIONGT4), 1) From c883abf838f33354f78f044ea9945278558173e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Nov 2023 22:41:53 +0100 Subject: [PATCH 433/718] Revert unintentional change to linking rule from PR 4322 --- lapack-netlib/TESTING/LIN/Makefile | 40 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 46e096c2f..714efa52a 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -269,35 +269,35 @@ proto-double: xlintstds xlintstrfd proto-complex: xlintstrfc proto-complex16: xlintstzc xlintstrfz -xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(ALINTST): $(FRC) $(SCLNTST): $(FRC) From f745f02f35641061511a3b2c368a21ef374c56f4 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 24 Nov 2023 14:51:37 +0800 Subject: [PATCH 434/718] benchmark: Fix missing colons in outputs of ./strsv.goto --- benchmark/trsv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 66ac3a3c7..e17c57157 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[]){ long long muls = n*(n+1)/2.0; long long adds = (n - 1.0)*n/2.0; - fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); + fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); if(a != NULL){ free(a); } From ca5a87ff1de25fd9d6dd22f4b4c635ec040fe59d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Nov 2023 15:31:18 +0100 Subject: [PATCH 435/718] Small documentation fix for Truncated QR With Pivoting (Reference-LAPACK PR 941) --- lapack-netlib/SRC/cgeqp3rk.f | 12 ++++++------ lapack-netlib/SRC/claqp2rk.f | 6 +++--- lapack-netlib/SRC/claqp3rk.f | 6 +++--- lapack-netlib/SRC/dgeqp3rk.f | 4 ++-- lapack-netlib/SRC/dlaqp2rk.f | 4 ++-- lapack-netlib/SRC/dlaqp3rk.f | 4 ++-- lapack-netlib/SRC/sgeqp3rk.f | 4 ++-- lapack-netlib/SRC/slaqp2rk.f | 4 ++-- lapack-netlib/SRC/slaqp3rk.f | 4 ++-- lapack-netlib/SRC/zgeqp3rk.f | 12 ++++++------ lapack-netlib/SRC/zlaqp2rk.f | 6 +++--- lapack-netlib/SRC/zlaqp3rk.f | 6 +++--- 12 files changed, 36 insertions(+), 36 deletions(-) diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f index 70789e64f..587860684 100644 --- a/lapack-netlib/SRC/cgeqp3rk.f +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -55,7 +55,7 @@ *> where: *> *> P(K) is an N-by-N permutation matrix; -*> Q(K) is an M-by-M orthogonal matrix; +*> Q(K) is an M-by-M unitary matrix; *> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the *> full rank factor R with K-by-K upper-triangular *> R11(K) and K-by-N rectangular R12(K). The diagonal @@ -124,14 +124,14 @@ *> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum *> column 2-norm of the original matrix A, which is equal *> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); -*> e) Q(K)**H * B, the matrix B with the orthogonal +*> e) Q(K)**H * B, the matrix B with the unitary *> transformation Q(K)**H applied on the left. *> *> The N-by-N permutation matrix P(K) is stored in a compact form in *> the integer array JPIV. For 1 <= j <= N, column j *> of the matrix A was interchanged with column JPIV(j). *> -*> The M-by-M orthogonal matrix Q is represented as a product +*> The M-by-M unitary matrix Q is represented as a product *> of elementary Householder reflectors *> *> Q(K) = H(1) * H(2) * . . . * H(K), @@ -300,7 +300,7 @@ *> *> 1. The elements below the diagonal of the subarray *> A(1:M,1:K) together with TAU(1:K) represent the -*> orthogonal matrix Q(K) as a product of K Householder +*> unitary matrix Q(K) as a product of K Householder *> elementary reflectors. *> *> 2. The elements on and above the diagonal of @@ -579,8 +579,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f index 073ad0f88..6b1db085a 100644 --- a/lapack-netlib/SRC/claqp2rk.f +++ b/lapack-netlib/SRC/claqp2rk.f @@ -178,7 +178,7 @@ *> On exit: *> 1. The elements in block A(IOFFSET+1:M,1:K) below *> the diagonal together with the array TAU represent -*> the orthogonal matrix Q(K) as a product of elementary +*> the unitary matrix Q(K) as a product of elementary *> reflectors. *> 2. The upper triangular block of the matrix A stored *> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. @@ -332,8 +332,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f index af5e85645..3703bcbd6 100644 --- a/lapack-netlib/SRC/claqp3rk.f +++ b/lapack-netlib/SRC/claqp3rk.f @@ -196,7 +196,7 @@ *> On exit: *> 1. The elements in block A(IOFFSET+1:M,1:KB) below *> the diagonal together with the array TAU represent -*> the orthogonal matrix Q(KB) as a product of elementary +*> the unitary matrix Q(KB) as a product of elementary *> reflectors. *> 2. The upper triangular block of the matrix A stored *> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. @@ -383,8 +383,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f index ace97b712..117a68287 100644 --- a/lapack-netlib/SRC/dgeqp3rk.f +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -573,8 +573,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f index b5a84d0de..aecd6bb69 100644 --- a/lapack-netlib/SRC/dlaqp2rk.f +++ b/lapack-netlib/SRC/dlaqp2rk.f @@ -331,8 +331,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f index 39e617d0e..8139345ed 100644 --- a/lapack-netlib/SRC/dlaqp3rk.f +++ b/lapack-netlib/SRC/dlaqp3rk.f @@ -389,8 +389,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f index 17559c7f4..bb5da72dc 100644 --- a/lapack-netlib/SRC/sgeqp3rk.f +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -573,8 +573,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f index d3dbb3d7c..f88b0ce90 100644 --- a/lapack-netlib/SRC/slaqp2rk.f +++ b/lapack-netlib/SRC/slaqp2rk.f @@ -331,8 +331,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f index fa735bb9d..b2dc2b334 100644 --- a/lapack-netlib/SRC/slaqp3rk.f +++ b/lapack-netlib/SRC/slaqp3rk.f @@ -389,8 +389,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f index f8ef986c7..247a3c379 100644 --- a/lapack-netlib/SRC/zgeqp3rk.f +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -55,7 +55,7 @@ *> where: *> *> P(K) is an N-by-N permutation matrix; -*> Q(K) is an M-by-M orthogonal matrix; +*> Q(K) is an M-by-M unitary matrix; *> R(K)_approx = ( R11(K), R12(K) ) is a rank K approximation of the *> full rank factor R with K-by-K upper-triangular *> R11(K) and K-by-N rectangular R12(K). The diagonal @@ -124,14 +124,14 @@ *> d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum *> column 2-norm of the original matrix A, which is equal *> to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 ); -*> e) Q(K)**H * B, the matrix B with the orthogonal +*> e) Q(K)**H * B, the matrix B with the unitary *> transformation Q(K)**H applied on the left. *> *> The N-by-N permutation matrix P(K) is stored in a compact form in *> the integer array JPIV. For 1 <= j <= N, column j *> of the matrix A was interchanged with column JPIV(j). *> -*> The M-by-M orthogonal matrix Q is represented as a product +*> The M-by-M unitary matrix Q is represented as a product *> of elementary Householder reflectors *> *> Q(K) = H(1) * H(2) * . . . * H(K), @@ -300,7 +300,7 @@ *> *> 1. The elements below the diagonal of the subarray *> A(1:M,1:K) together with TAU(1:K) represent the -*> orthogonal matrix Q(K) as a product of K Householder +*> unitary matrix Q(K) as a product of K Householder *> elementary reflectors. *> *> 2. The elements on and above the diagonal of @@ -579,8 +579,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f index f1e9f4899..f6bf555c2 100644 --- a/lapack-netlib/SRC/zlaqp2rk.f +++ b/lapack-netlib/SRC/zlaqp2rk.f @@ -178,7 +178,7 @@ *> On exit: *> 1. The elements in block A(IOFFSET+1:M,1:K) below *> the diagonal together with the array TAU represent -*> the orthogonal matrix Q(K) as a product of elementary +*> the unitary matrix Q(K) as a product of elementary *> reflectors. *> 2. The upper triangular block of the matrix A stored *> in A(IOFFSET+1:M,1:K) is the triangular factor obtained. @@ -332,8 +332,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f index 7a9fdfd95..0dd8bf8e3 100644 --- a/lapack-netlib/SRC/zlaqp3rk.f +++ b/lapack-netlib/SRC/zlaqp3rk.f @@ -196,7 +196,7 @@ *> On exit: *> 1. The elements in block A(IOFFSET+1:M,1:KB) below *> the diagonal together with the array TAU represent -*> the orthogonal matrix Q(KB) as a product of elementary +*> the unitary matrix Q(KB) as a product of elementary *> reflectors. *> 2. The upper triangular block of the matrix A stored *> in A(IOFFSET+1:M,1:KB) is the triangular factor obtained. @@ -383,8 +383,8 @@ *> \verbatim *> *> November 2023, Igor Kozachenko, James Demmel, -*> Computer Science Division, -*> University of California, Berkeley +*> EECS Department, +*> University of California, Berkeley, USA. *> *> \endverbatim * From f7351e493c6ce928e5c924c15d709c3de7c62d7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Nov 2023 18:49:34 +0100 Subject: [PATCH 436/718] Update Reference-LAPACK docs to 3.12.0 --- lapack-netlib/DOCS/Doxyfile | 478 +++++--- lapack-netlib/DOCS/DoxygenLayout.xml | 197 ++++ lapack-netlib/DOCS/groups-usr.dox | 1575 ++++++++++++++++---------- 3 files changed, 1480 insertions(+), 770 deletions(-) create mode 100644 lapack-netlib/DOCS/DoxygenLayout.xml diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile index 43cea43b5..577675772 100644 --- a/lapack-netlib/DOCS/Doxyfile +++ b/lapack-netlib/DOCS/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.10 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -38,7 +38,7 @@ PROJECT_NAME = LAPACK # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 3.9.0 +PROJECT_NUMBER = 3.12.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -179,6 +187,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = NO +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -199,6 +217,14 @@ QT_AUTOBRIEF = NO MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. @@ -226,16 +252,15 @@ TAB_SIZE = 8 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all @@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN = YES OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -293,6 +330,15 @@ EXTENSION_MAPPING = MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT = YES # all members of a group must be documented explicitly. # The default value is: NO. -DISTRIBUTE_GROUP_DOC = YES +DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option @@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT = NO LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which efficively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -424,6 +483,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS = NO HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO @@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = NO @@ -684,12 +763,12 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = DOCS/DoxygenLayout.xml # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -705,7 +784,7 @@ CITE_BIB_FILES = # messages are off. # The default value is: NO. -QUIET = YES +QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES @@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. -WARN_NO_PARAMDOC = NO +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which @@ -753,7 +842,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = output_err +WARN_LOGFILE = doxygen_error #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -762,17 +851,18 @@ WARN_LOGFILE = output_err # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with -# spaces. +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = . \ - DOCS/groups-usr.dox +INPUT = BLAS CBLAS SRC INSTALL TESTING \ + DOCS/groups-usr.dox \ + README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -785,14 +875,19 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.f \ + *.f90 \ *.h # The RECURSIVE tag can be used to specify whether or not subdirectories should @@ -808,34 +903,15 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = CMAKE \ - DOCS \ - .svn \ - CBLAS/.svn \ - CBLAS/src/.svn \ - CBLAS/testing/.svn \ - CBLAS/example/.svn \ - CBLAS/include/.svn \ - BLAS/.svn \ - BLAS/SRC/.svn \ - BLAS/TESTING/.svn \ - SRC/.svn \ - SRC/VARIANTS/.svn \ - SRC/VARIANTS/LIB/.svn \ - SRC/VARIANTS/cholesky/.svn \ - SRC/VARIANTS/cholesky/RL/.svn \ - SRC/VARIANTS/cholesky/TOP/.svn \ - SRC/VARIANTS/lu/.svn \ - SRC/VARIANTS/lu/CR/.svn \ - SRC/VARIANTS/lu/LL/.svn \ - SRC/VARIANTS/lu/REC/.svn \ - SRC/VARIANTS/qr/.svn \ - SRC/VARIANTS/qr/LL/.svn \ - INSTALL/.svn \ - TESTING/.svn \ - TESTING/EIG/.svn \ - TESTING/MATGEN/.svn \ - TESTING/LIN/.svn +# Exclude functions that are duplicated, creating conflicts. +EXCLUDE = .git \ + .github \ + SRC/VARIANTS \ + BLAS/SRC/lsame.f \ + BLAS/SRC/xerbla.f \ + BLAS/SRC/xerbla_array.f \ + INSTALL/slamchf77.f \ + INSTALL/dlamchf77.f \ # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -908,6 +984,10 @@ IMAGE_PATH = # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. INPUT_FILTER = @@ -917,6 +997,10 @@ INPUT_FILTER = # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. FILTER_PATTERNS = @@ -969,7 +1053,7 @@ INLINE_SOURCES = YES STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1028,25 +1112,6 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES -# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1058,13 +1123,6 @@ CLANG_OPTIONS = ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored @@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = YES +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1300,7 +1370,7 @@ CHM_FILE = HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1345,7 +1415,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1353,8 +1424,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML @@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1495,7 +1583,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1510,8 +1598,8 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://www.mathjax.org/mathjax @@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing @@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1637,21 +1728,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES = PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_BIB_STYLE = plain +# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated +# page will contain the date and time when the page was generated. Setting this +# to NO can help when comparing the output of multiple runs. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_TIMESTAMP = NO + +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1817,9 +1940,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = YES -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1828,8 +1951,8 @@ RTF_HYPERLINKS = YES RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1915,6 +2038,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /sw/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2135,15 +2259,6 @@ PERL_PATH = /sw/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2241,10 +2356,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2371,6 +2508,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. @@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml new file mode 100644 index 000000000..aeb346d8d --- /dev/null +++ b/lapack-netlib/DOCS/DoxygenLayout.xml @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox index 6c31e4cf5..cbd747165 100644 --- a/lapack-netlib/DOCS/groups-usr.dox +++ b/lapack-netlib/DOCS/groups-usr.dox @@ -1,605 +1,974 @@ /** - * - **** - * - * @defgroup lapack LAPACK - * This is the group of LAPACK routines. - * - * @defgroup GE General Matrices - * @ingroup lapack - * This is the group of General Matrices routines - * @defgroup GB General Band Matrix - * @ingroup lapack - * This is the group of General Band routines - * @defgroup SY Symmetric Matrix - * @ingroup lapack - * This is the group of Symmetric Matrix routines - * @defgroup HE Hermitian Matrix - * @ingroup lapack - * This is the group of Hermitian Matrix routines - * @defgroup PO Positive Definite Matrix - * @ingroup lapack - * This is the group of Positive Definite routines - * @defgroup GT General tridiagonal Matrix - * @ingroup lapack - * This is the group of General tridiagonal routines - * @defgroup PT Positive Definite tridiagonal Matrix - * @ingroup lapack - * This is the group of Positive Definite tridiagonal routines - * @defgroup OTHEReigen Eigenvalue - * @ingroup lapack - * This is the group of Other Eigenvalue routines - * @defgroup OTHERauxiliary Other Auxiliary Routines - * @ingroup lapack - * This is the group of Other Auxiliary routines - * @defgroup OTHERcomputational Other Computational Routines - * @ingroup lapack - * This is the group of Other Computational routines - * @defgroup OTHERsolve Other Solve Routines - * @ingroup lapack - * This is the group of Other Solve routines - * - **** - * - * @defgroup solveGE Linear Solve - * @ingroup GE - * This is the group of Linear Solve Driver routines - * @defgroup solveGB Linear Solve - * @ingroup GB - * This is the group of Linear Solve Driver routines - * @defgroup solveSY Linear Solve - * @ingroup SY - * This is the group of Linear Solve Driver routines - * @defgroup solveHE Linear Solve - * @ingroup HE - * This is the group of Linear Solve Driver routines - * @defgroup solvePO Linear Solve - * @ingroup PO - * This is the group of Linear Solve Driver routines - * @defgroup solveGT Linear Solve - * @ingroup GT - * This is the group of Linear Solve Driver routines - * @defgroup solvePT Linear Solve - * @ingroup PT - * This is the group of Linear Solve Driver routines - * @defgroup eigenGE Eigenvalue - * @ingroup GE - * This is the group of Eigenvalue Driver routines - * @defgroup eigenSY Eigenvalue - * @ingroup SY - * This is the group of Eigenvalue Driver routines - * @defgroup eigenHE Eigenvalue - * @ingroup HE - * This is the group of Eigenvalue Driver routines - * @defgroup singGE Singular Value - * @ingroup GE - * This is the group of Singular Value Driver routines - * @defgroup computationalGE Computational routines - * @ingroup GE - * This is the group of Computational routines - * @defgroup variantsGEcomputational Variants Computational routines - * @ingroup GE - * This is the group of Variants Computational routines - * @defgroup computationalGB Computational routines - * @ingroup GB - * This is the group of Computational routines - * @defgroup computationalSY Computational routines - * @ingroup SY - * This is the group of Computational routines - * @defgroup computationalHE Computational routines - * @ingroup HE - * This is the group of Computational routines - * @defgroup computationalPO Computational routines - * @ingroup PO - * This is the group of Computational routines - * @defgroup variantsPOcomputational Variants Computational routines - * @ingroup PO - * This is the group of Variants Computational routines - * @defgroup computationalGT Computational routines - * @ingroup GT - * This is the group of Computational routines - * @defgroup computationalPT Computational routines - * @ingroup PT - * This is the group of Computational routines - * @defgroup variantsOTHERcomputational Variants Computational routines - * @ingroup OTHERcomputational - * This is the group of Variants Computational routines - * @defgroup auxiliaryGE Auxiliary routines - * @ingroup GE - * This is the group of Auxiliary routines - * @defgroup auxiliaryGB Auxiliary routines - * @ingroup GB - * This is the group of Auxiliary routines - * @defgroup auxiliarySY Auxiliary routines - * @ingroup SY - * This is the group of Auxiliary routines - * @defgroup auxiliaryHE Auxiliary routines - * @ingroup HE - * This is the group of Auxiliary routines - * @defgroup auxiliaryPO Auxiliary routines - * @ingroup PO - * This is the group of Auxiliary routines - * @defgroup auxiliaryGT Auxiliary routines - * @ingroup GT - * This is the group of Auxiliary routines - * @defgroup auxiliaryPT Auxiliary routines - * @ingroup PT - * This is the group of Auxiliary routines - * - **** - * - * @defgroup doubleGEsolve double - * @ingroup solveGE - * This is the group of double solve driver functions for GE matrices - * @defgroup doublePOsolve double - * @ingroup solvePO - * This is the group of double solve driver functions for PO matrices - * @defgroup doubleSYsolve double - * @ingroup solveSY - * This is the group of double solve driver functions for SY matrices - * @defgroup doubleGBsolve double - * @ingroup solveGB - * This is the group of double solve driver functions for GB matrices - * @defgroup doubleGTsolve double - * @ingroup solveGT - * This is the group of double solve driver functions for GT matrices - * @defgroup doublePTsolve double - * @ingroup solvePT - * This is the group of double solve driver functions for PT matrices - * @defgroup doubleGEeigen double - * @ingroup eigenGE - * This is the group of double eigenvalue driver functions for GE matrices - * @defgroup doubleSYeigen double - * @ingroup eigenSY - * This is the group of double eigenvalue driver functions for SY matrices - * @defgroup doubleGEsing double - * @ingroup singGE - * This is the group of double singular value driver functions for GE matrices - * @defgroup doubleGEcomputational double - * @ingroup computationalGE - * This is the group of double computational functions for GE matrices - * @defgroup doublePOcomputational double - * @ingroup computationalPO - * This is the group of double computational functions for PO matrices - * @defgroup doubleSYcomputational double - * @ingroup computationalSY - * This is the group of double computational functions for SY matrices - * @defgroup doubleGBcomputational double - * @ingroup computationalGB - * This is the group of double computational functions for GB matrices - * @defgroup doubleGTcomputational double - * @ingroup computationalGT - * This is the group of double computational functions for GT matrices - * @defgroup doublePTcomputational double - * @ingroup computationalPT - * This is the group of double computational functions for PT matrices - * @defgroup doubleGEauxiliary double - * @ingroup auxiliaryGE - * This is the group of double auxiliary functions for GE matrices - * @defgroup doublePOauxiliary double - * @ingroup auxiliaryPO - * This is the group of double auxiliary functions for PO matrices - * @defgroup doubleSYauxiliary double - * @ingroup auxiliarySY - * This is the group of double auxiliary functions for SY matrices - * @defgroup doubleGBauxiliary double - * @ingroup auxiliaryGB - * This is the group of double auxiliary functions for GB matrices - * @defgroup doublePTauxiliary double - * @ingroup auxiliaryPT - * This is the group of double auxiliary functions for PT matrices - * @defgroup doubleGTauxiliary double - * @ingroup auxiliaryGT - * This is the group of double auxiliary functions for GT matrices - * @defgroup doubleOTHERauxiliary double - * @ingroup OTHERauxiliary - * This is the group of double other auxiliary routines - * @defgroup doubleOTHERcomputational double - * @ingroup OTHERcomputational - * This is the group of double other Computational routines - * @defgroup doubleOTHERsolve double Other Solve Routines - * @ingroup OTHERsolve - * This is the group of double Other Solve routines - * @defgroup doubleOTHEReigen double - * @ingroup OTHEReigen - * This is the group of double Other Eigenvalue routines - * - **** - * - * @defgroup realGEsolve real - * @ingroup solveGE - * This is the group of real solve driver functions for GE matrices - * @defgroup realPOsolve real - * @ingroup solvePO - * This is the group of real solve driver functions for PO matrices - * @defgroup realSYsolve real - * @ingroup solveSY - * This is the group of real solve driver functions for SY matrices - * @defgroup realGBsolve real - * @ingroup solveGB - * This is the group of real solve driver functions for GB matrices - * @defgroup realGTsolve real - * @ingroup solveGT - * This is the group of real solve driver functions for GT matrices - * @defgroup realPTsolve real - * @ingroup solvePT - * This is the group of real solve driver functions for PT matrices - * @defgroup realGEeigen real - * @ingroup eigenGE - * This is the group of real eigenvalue driver functions for GE matrices - * @defgroup realSYeigen real - * @ingroup eigenSY - * This is the group of real eigenvalue driver functions for SY matrices - * @defgroup realGEsing real - * @ingroup singGE - * This is the group of real singular value driver functions for GE matrices - * @defgroup realGEcomputational real - * @ingroup computationalGE - * This is the group of real computational functions for GE matrices - * @defgroup realPOcomputational real - * @ingroup computationalPO - * This is the group of real computational functions for PO matrices - * @defgroup realSYcomputational real - * @ingroup computationalSY - * This is the group of real computational functions for SY matrices - * @defgroup realGBcomputational real - * @ingroup computationalGB - * This is the group of real computational functions for GB matrices - * @defgroup realPTcomputational real - * @ingroup computationalPT - * This is the group of real computational functions for PT matrices - * @defgroup realGTcomputational real - * @ingroup computationalGT - * This is the group of real computational functions for GT matrices - * @defgroup realGEauxiliary real - * @ingroup auxiliaryGE - * This is the group of real auxiliary functions for GE matrices - * @defgroup realPOauxiliary real - * @ingroup auxiliaryPO - * This is the group of real auxiliary functions for PO matrices - * @defgroup realSYauxiliary real - * @ingroup auxiliarySY - * This is the group of real auxiliary functions for SY matrices - * @defgroup realGBauxiliary real - * @ingroup auxiliaryGB - * This is the group of real auxiliary functions for GB matrices - * @defgroup realGTauxiliary real - * @ingroup auxiliaryGT - * This is the group of real auxiliary functions for GT matrices - * @defgroup realPTauxiliary real - * @ingroup auxiliaryPT - * This is the group of real auxiliary functions for PT matrices - * @defgroup realOTHERauxiliary real - * @ingroup OTHERauxiliary - * This is the group of real other auxiliary routines - * @defgroup realOTHERcomputational real - * @ingroup OTHERcomputational - * This is the group of real other Computational routines - * @defgroup realOTHERsolve real Other Solve Routines - * @ingroup OTHERsolve - * This is the group of real Other Solve routines - * @defgroup realOTHEReigen real - * @ingroup OTHEReigen - * This is the group of real Other Eigenvalue routines - * - **** - * - * @defgroup complexGEsolve complex - * @ingroup solveGE - * This is the group of complex solve driver functions for GE matrices - * @defgroup complexPOsolve complex - * @ingroup solvePO - * This is the group of complex solve driver functions for PO matrices - * @defgroup complexSYsolve complex - * @ingroup solveSY - * This is the group of complex solve driver functions for SY matrices - * @defgroup complexHEsolve complex - * @ingroup solveHE - * This is the group of complex solve driver functions for HE matrices - * @defgroup complexGBsolve complex - * @ingroup solveGB - * This is the group of complex solve driver functions for GB matrices - * @defgroup complexGTsolve complex - * @ingroup solveGT - * This is the group of complex solve driver functions for GT matrices - * @defgroup complexPTsolve complex - * @ingroup solvePT - * This is the group of complex solve driver functions for PT matrices - * @defgroup complexGEeigen complex - * @ingroup eigenGE - * This is the group of complex eigenvalue driver functions for GE matrices - * @defgroup complexSYeigen complex - * @ingroup eigenSY - * This is the group of complex eigenvalue driver functions for SY matrices - * @defgroup complexHEeigen complex - * @ingroup eigenHE - * This is the group of complex eigenvalue driver functions for HE matrices - * @defgroup complexGEsing complex - * @ingroup singGE - * This is the group of complex singular value driver functions for GE matrices - * @defgroup complexGEcomputational complex - * @ingroup computationalGE - * This is the group of complex computational functions for GE matrices - * @defgroup complexPOcomputational complex - * @ingroup computationalPO - * This is the group of complex computational functions for PO matrices - * @defgroup complexSYcomputational complex - * @ingroup computationalSY - * This is the group of complex computational functions for SY matrices - * @defgroup complexHEcomputational complex - * @ingroup computationalHE - * This is the group of complex computational functions for HE matrices - * @defgroup complexGBcomputational complex - * @ingroup computationalGB - * This is the group of complex computational functions for GB matrices - * @defgroup complexGTcomputational complex - * @ingroup computationalGT - * This is the group of complex computational functions for GT matrices - * @defgroup complexPTcomputational complex - * @ingroup computationalPT - * This is the group of complex computational functions for PT matrices - * @defgroup complexGEauxiliary complex - * @ingroup auxiliaryGE - * This is the group of complex auxiliary functions for GE matrices - * @defgroup complexPOauxiliary complex - * @ingroup auxiliaryPO - * This is the group of complex auxiliary functions for PO matrices - * @defgroup complexSYauxiliary complex - * @ingroup auxiliarySY - * This is the group of complex auxiliary functions for SY matrices - * @defgroup complexHEauxiliary complex - * @ingroup auxiliaryHE - * This is the group of complex auxiliary functions for HE matrices - * @defgroup complexGBauxiliary complex - * @ingroup auxiliaryGB - * This is the group of complex auxiliary functions for GB matrices - * @defgroup complexOTHERauxiliary complex - * @ingroup OTHERauxiliary - * This is the group of complex other auxiliary routines - * @defgroup complexOTHERcomputational complex - * @ingroup OTHERcomputational - * This is the group of complex other Computational routines - * @defgroup complexOTHERsolve complex Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex Other Solve routines - * @defgroup complexOTHEReigen complex Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex Other Eigenvalue routines - * - **** - * - * @defgroup complex16GEsolve complex16 - * @ingroup solveGE - * This is the group of complex16 solve driver functions for GE matrices - * @defgroup complex16POsolve complex16 - * @ingroup solvePO - * This is the group of complex16 solve driver functions for PO matrices - * @defgroup complex16SYsolve complex16 - * @ingroup solveSY - * This is the group of complex16 solve driver functions for SY matrices - * @defgroup complex16HEsolve complex16 - * @ingroup solveHE - * This is the group of complex16 solve driver functions for HE matrices - * @defgroup complex16GBsolve complex16 - * @ingroup solveGB - * This is the group of complex16 solve driver functions for GB matrices - * @defgroup complex16GTsolve complex16 - * @ingroup solveGT - * This is the group of complex16 solve driver functions for GT matrices - * @defgroup complex16PTsolve complex16 - * @ingroup solvePT - * This is the group of complex16 solve driver functions for PT matrices - * @defgroup complex16GEeigen complex16 - * @ingroup eigenGE - * This is the group of complex16 eigenvalue driver functions for GE matrices - * @defgroup complex16SYeigen complex16 - * @ingroup eigenSY - * This is the group of complex16 eigenvalue driver functions for SY matrices - * @defgroup complex16HEeigen complex16 - * @ingroup eigenHE - * This is the group of complex16 eigenvalue driver functions for HE matrices - * @defgroup complex16GEsing complex16 - * @ingroup singGE - * This is the group of complex16 singular value driver functions for GE matrices - * @defgroup complex16GEcomputational complex16 - * @ingroup computationalGE - * This is the group of complex16 computational functions for GE matrices - * @defgroup complex16POcomputational complex16 - * @ingroup computationalPO - * This is the group of complex16 computational functions for PO matrices - * @defgroup complex16SYcomputational complex16 - * @ingroup computationalSY - * This is the group of complex16 computational functions for SY matrices - * @defgroup complex16HEcomputational complex16 - * @ingroup computationalHE - * This is the group of complex16 computational functions for HE matrices - * @defgroup complex16GBcomputational complex16 - * @ingroup computationalGB - * This is the group of complex16 computational functions for GB matrices - * @defgroup complex16GTcomputational complex16 - * @ingroup computationalGT - * This is the group of complex16 computational functions for GT matrices - * @defgroup complex16PTcomputational complex16 - * @ingroup computationalPT - * This is the group of complex16 computational functions for PT matrices - * @defgroup complex16GEauxiliary complex16 - * @ingroup auxiliaryGE - * This is the group of complex16 auxiliary functions for GE matrices - * @defgroup complex16POauxiliary complex16 - * @ingroup auxiliaryPO - * This is the group of complex16 auxiliary functions for PO matrices - * @defgroup complex16SYauxiliary complex16 - * @ingroup auxiliarySY - * This is the group of complex16 auxiliary functions for SY matrices - * @defgroup complex16HEauxiliary complex16 - * @ingroup auxiliaryHE - * This is the group of complex16 auxiliary functions for HE matrices - * @defgroup complex16GBauxiliary complex16 - * @ingroup auxiliaryGB - * This is the group of complex16 auxiliary functions for GB matrices - * @defgroup complex16OTHERcomputational complex16 - * @ingroup OTHERcomputational - * This is the group of complex16 other Computational routines - * @defgroup complex16OTHERauxiliary complex16 - * @ingroup OTHERauxiliary - * This is the group of complex16 other auxiliary routines - * @defgroup auxOTHERcomputational auxiliary Computational routines - * @ingroup OTHERcomputational - * This is the group of auxiliary Computational routines - * @defgroup complex16OTHERsolve complex16 Other Solve Routines - * @ingroup OTHERsolve - * This is the group of complex16 Other Solve routines - * @defgroup complex16OTHEReigen complex16 Other Eigenvalue routines - * @ingroup OTHEReigen - * This is the group of complex16 Other Eigenvalue routines - * - **** - * - * @defgroup testing LAPACK Testing - * This is the group of LAPACK TESTING routines. - * - * @defgroup matgen Matrix Generation - * @ingroup testing - * This is the group of LAPACK TESTING MATGEN routines. - * - * @defgroup lin Linear Solve - * @ingroup testing - * This is the group of LAPACK TESTING LIN routines. - * - * @defgroup eig Eigenvalue and Singular value - * @ingroup testing - * This is the group of LAPACK TESTING EIG routines. - * - * @defgroup real_matgen real - * @ingroup matgen - * This is the group of real LAPACK TESTING MATGEN routines. - * - * @defgroup double_matgen double - * @ingroup matgen - * This is the group of double LAPACK TESTING MATGEN routines. - * - * @defgroup complex_matgen complex - * @ingroup matgen - * This is the group of complex LAPACK TESTING MATGEN routines. - * - * @defgroup complex16_matgen complex16 - * @ingroup matgen - * This is the group of complex16 LAPACK TESTING MATGEN routines. - * - * @defgroup aux_matgen aux - * @ingroup matgen - * This is the group of auxiliary LAPACK TESTING MATGEN routines. - * - * @defgroup single_lin real - * @ingroup lin - * This is the group of real LAPACK TESTING LIN routines. - * - * @defgroup double_lin double - * @ingroup lin - * This is the group of double LAPACK TESTING LIN routines. - * - * @defgroup complex_lin complex - * @ingroup lin - * This is the group of complex LAPACK TESTING LIN routines. - * - * @defgroup complex16_lin complex16 - * @ingroup lin - * This is the group of complex16 LAPACK TESTING LIN routines. - * - * @defgroup aux_lin aux - * @ingroup lin - * This is the group of auxiliary LAPACK TESTING LIN routines. - * - * @defgroup single_eig real - * @ingroup eig - * This is the group of real LAPACK TESTING EIG routines. - * - * @defgroup double_eig double - * @ingroup eig - * This is the group of double LAPACK TESTING EIG routines. - * - * @defgroup complex_eig complex - * @ingroup eig - * This is the group of complex LAPACK TESTING EIG routines. - * - * @defgroup complex16_eig complex16 - * @ingroup eig - * This is the group of complex16 LAPACK TESTING EIG routines. - * - * @defgroup aux_eig aux - * @ingroup eig - * This is the group of auxiliary LAPACK TESTING EIG routines. - * - **** - * @defgroup blas Reference BLAS - * This is the group of BLAS routines. - * - * @defgroup level1 Level1 - * @ingroup blas - * This is the group of LEVEL 1 BLAS routines. - * @defgroup level2 Level2 - * @ingroup blas - * This is the group of LEVEL 2 BLAS routines. - * @defgroup level3 Level3 - * @ingroup blas - * This is the group of LEVEL 3 BLAS routines. - * @defgroup aux_blas Auxiliary BLAS - * @ingroup blas - * This is the group of Auxiliary 3 BLAS routines. -* @defgroup blastesting Testing - * @ingroup blas - * This is the group of BLAS TESTING routines. - * - * @defgroup single_blas_level1 real - * @ingroup level1 - * This is the group of real LEVEL 1 BLAS routines. - * @defgroup double_blas_level1 double - * @ingroup level1 - * This is the group of double LEVEL 1 BLAS routines. - * @defgroup complex_blas_level1 complex - * @ingroup level1 - * This is the group of complex LEVEL 1 BLAS routines. - * @defgroup complex16_blas_level1 complex16 - * @ingroup level1 - * This is the group of complex16 LEVEL 1 BLAS routines. - * - * @defgroup single_blas_level2 real - * @ingroup level2 - * This is the group of real LEVEL 2 BLAS routines. - * @defgroup double_blas_level2 double - * @ingroup level2 - * This is the group of double LEVEL 2 BLAS routines. - * @defgroup complex_blas_level2 complex - * @ingroup level2 - * This is the group of complex LEVEL 2 BLAS routines. - * @defgroup complex16_blas_level2 complex16 - * @ingroup level2 - * This is the group of complex16 LEVEL 2 BLAS routines. - * - * @defgroup single_blas_level3 real - * @ingroup level3 - * This is the group of real LEVEL 3 BLAS routines. - * @defgroup double_blas_level3 double - * @ingroup level3 - * This is the group of double LEVEL 3 BLAS routines. - * @defgroup complex_blas_level3 complex - * @ingroup level3 - * This is the group of complex LEVEL 3 BLAS routines. - * @defgroup complex16_blas_level3 complex16 - * @ingroup level3 - * This is the group of complex16 LEVEL 3 BLAS routines. - * - * @defgroup single_blas_testing real - * @ingroup blastesting - * This is the group of real BLAS TESTING routines. - * @defgroup double_blas_testing double - * @ingroup blastesting - * This is the group of double BLAS TESTING routines. - * @defgroup complex_blas_testing complex - * @ingroup blastesting - * This is the group of complex BLAS TESTING routines. - * @defgroup complex16_blas_testing complex16 - * @ingroup blastesting - * This is the group of complex16 BLAS TESTING routines. - * + +@defgroup lapack_top LAPACK +@{ + @defgroup solve_top Linear solve, AX = B + @{ + @defgroup gesv_driver_grp LU: General matrix, driver + @{ + @defgroup gesv_driver --- full --- + @defgroup gesv gesv: factor and solve + @defgroup gesvx gesvx: factor and solve, expert + @defgroup gesvxx gesvxx: factor and solve, extra precise + @defgroup gesv_mixed gesv: factor and solve, mixed precision + + @defgroup gbsv_driver --- banded --- + @defgroup gbsv gbsv: factor and solve + @defgroup gbsvx gbsvx: factor and solve, expert + @defgroup gbsvxx gbsvxx: factor and solve, extra precise + + @defgroup gtsv_driver --- tridiagonal --- + @defgroup gtsv gtsv: factor and solve + @defgroup gtsvx gtsvx: factor and solve, expert + @} + + @defgroup gesv_comp_grp LU: computational routines (factor, cond, etc.) + @{ + @defgroup gesv_comp --- full --- + @defgroup gecon gecon: condition number estimate + @defgroup getrf getrf: triangular factor + @defgroup getrf2 getrf2: triangular factor panel, recursive? + @defgroup getf2 getf2: triangular factor panel, level 2 + @defgroup getrs getrs: triangular solve using factor + @defgroup getri getri: triangular inverse + @defgroup gerfs gerfs: iterative refinement + @defgroup gerfsx gerfsx: iterative refinement, expert + @defgroup geequ geequ: equilibration + @defgroup geequb geequb: equilibration, power of 2 + @defgroup laqge laqge: row/col scale matrix + @defgroup laswp laswp: swap permutation + + @defgroup getc2 getc2: triangular factor, with complete pivoting + @defgroup gesc2 gesc2: triangular solve using factor, with complete pivoting + @defgroup latdf latdf: Dif-estimate with complete pivoting LU, step in tgsen + + @defgroup la_gercond la_gercond: Skeel condition number estimate + @defgroup la_gerpvgrw la_gerpvgrw: reciprocal pivot growth + @defgroup la_gerfsx_extended la_gerfsx_extended: step in gerfsx + + @defgroup gbsv_comp --- banded --- + @defgroup gbcon gbcon: condition number estimate + @defgroup gbtrf gbtrf: triangular factor + @defgroup gbtf2 gbtf2: triangular factor, level 2 + @defgroup gbtrs gbtrs: triangular solve using factor + @defgroup gbrfs gbrfs: iterative refinement + @defgroup gbrfsx gbrfsx: iterative refinement, expert + @defgroup gbequ gbequ: equilibration + @defgroup gbequb gbequb: equilibration, power of 2 + @defgroup laqgb laqgb: row/col scale matrix + @defgroup la_gbrcond la_gbrcond: Skeel condition number estimate + @defgroup la_gbrpvgrw la_gbrpvgrw: reciprocal pivot growth + @defgroup la_gbrfsx_extended la_gbrfsx_extended: step in gbrfsx + + @defgroup gtsv_comp --- tridiagonal --- + @defgroup gtcon gtcon: condition number estimate + @defgroup gttrf gttrf: triangular factor + @defgroup gttrs gttrs: triangular solve using factor + @defgroup gtts2 gtts2: triangular solve using factor + @defgroup gtrfs gtrfs: iterative refinement + @} + + @defgroup posv_driver_grp Cholesky: Hermitian/symmetric positive definite matrix, driver + @{ + @defgroup posv_driver --- full --- + @defgroup posv posv: factor and solve + @defgroup posvx posvx: factor and solve, expert + @defgroup posvxx posvxx: factor and solve, extra precise + @defgroup posv_mixed posv: factor and solve, mixed precision + + @defgroup ppsv_driver --- packed --- + @defgroup ppsv ppsv: factor and solve + @defgroup ppsvx ppsvx: factor and solve, expert + + @defgroup pfsv_driver --- rectangular full packed (RFP) --- + @defgroup pfsv pfsv: factor and solve [not available] + + @defgroup pbsv_driver --- banded --- + @defgroup pbsv pbsv: factor and solve + @defgroup pbsvx pbsvx: factor and solve, expert + + @defgroup ptsv_driver --- tridiagonal --- + @defgroup ptsv ptsv: factor and solve + @defgroup ptsvx ptsvx: factor and solve, expert + @} + + @defgroup posv_comp_grp Cholesky: computational routines (factor, cond, etc.) + @{ + @defgroup posv_comp --- full --- + @defgroup pocon pocon: condition number estimate + @defgroup potrf potrf: triangular factor + @defgroup potrf2 potrf2: triangular factor panel, recursive? + @defgroup potf2 potf2: triangular factor panel, level 2 + @defgroup pstrf pstrf: triangular factor, with pivoting + @defgroup pstf2 pstf2: triangular factor, with pivoting panel, level 2 + @defgroup potrs potrs: triangular solve using factor + @defgroup potri potri: triangular inverse + @defgroup porfs porfs: iterative refinement + @defgroup porfsx porfsx: iterative refinement, expert + @defgroup poequ poequ: equilibration + @defgroup poequb poequb: equilibration, power of 2 + @defgroup laqhe laqhe: row/col scale matrix + @defgroup la_porcond la_porcond: Skeel condition number estimate + @defgroup la_porpvgrw la_porpvgrw: reciprocal pivot growth + @defgroup la_porfsx_extended la_porfsx_extended: step in porfsx + + @defgroup ppsv_comp --- packed --- + @defgroup ppcon ppcon: condition number estimate + @defgroup pptrf pptrf: triangular factor + @defgroup pptrs pptrs: triangular solve using factor + @defgroup pptri pptri: triangular inverse + @defgroup pprfs pprfs: iterative refinement + @defgroup ppequ ppequ: equilibration + @defgroup laqhp laqhp: row/col scale matrix + + @defgroup pfsv_comp --- rectangular full packed (RFP) --- + @defgroup pftrf pftrf: triangular factor + @defgroup pftrs pftrs: triangular solve using factor + @defgroup pftri pftri: triangular inverse + + @defgroup pbsv_comp --- banded --- + @defgroup pbcon pbcon: condition number estimate + @defgroup pbtrf pbtrf: triangular factor + @defgroup pbtf2 pbtf2: triangular factor panel, level 2 + @defgroup pbtrs pbtrs: triangular solve using factor + @defgroup pbrfs pbrfs: iterative refinement + @defgroup pbequ pbequ: equilibration + @defgroup laqhb laqhb: row/col scale matrix + + @defgroup ptsv_comp --- tridiagonal --- + @defgroup ptcon ptcon: condition number estimate + @defgroup pttrf pttrf: triangular factor + @defgroup pttrs pttrs: triangular solve using factor + @defgroup ptts2 ptts2: triangular solve using factor, unblocked + @defgroup ptrfs ptrfs: iterative refinement + @} + + @defgroup hesv_driver_grp LDL: Hermitian/symmetric indefinite matrix, driver + @{ + @defgroup hesv_driver --- full, rook pivoting --- + @defgroup hesv {he,sy}sv: rook (v1) + @defgroup hesv_rook {he,sy}sv_rook: rook (v2) + @defgroup hesv_rk {he,sy}sv_rk: rook (v3) + @defgroup hesvx {he,sy}svx: rook (v1, expert) + @defgroup hesvxx {he,sy}svxx: rook (v1, expert) + + @defgroup hpsv_driver --- packed, rook pivoting --- + @defgroup hpsv {hp,sp}sv: factor and solve + @defgroup hpsvx {hp,sp}svx: factor and solve, expert + + @defgroup hesv_aa_driver --- full, Aasen --- + @defgroup hesv_aa {he,sy}sv_aa: Aasen + @defgroup hesv_aa_2stage {he,sy}sv_aa_2stage: Aasen, blocked 2-stage + @} + + @defgroup hesv_comp_grp LDL: computational routines (factor, cond, etc.) + @{ + @defgroup hesv_comp_v1 --- full, rook v1 --- + @defgroup hecon {he,sy}con: condition number estimate + @defgroup hetrf {he,sy}trf: triangular factor + @defgroup lahef la{he,sy}f: step in hetrf + @defgroup hetf2 {he,sy}tf2: triangular factor, level 2 + @defgroup hetrs {he,sy}trs: triangular solve using factor + @defgroup hetri {he,sy}tri: triangular inverse + @defgroup herfs {he,sy}rfs: iterative refinement + @defgroup herfsx {he,sy}rfsx: iterative refinement, expert + @defgroup heequb {he,sy}equb: equilibration, power of 2 + @defgroup syconv syconv: convert to/from L and D from hetrf + + @defgroup hecon_3 {he,sy}con_3: condition number estimate + @defgroup hetri2 {he,sy}tri2: inverse + @defgroup hetri2x {he,sy}tri2x: inverse + @defgroup hetri_3 {he,sy}tri_3: inverse + @defgroup hetri_3x {he,sy}tri_3x: inverse + @defgroup hetrs2 {he,sy}trs2: solve using factor + @defgroup hetrs_3 {he,sy}trs_3: solve using factor + + @defgroup heswapr {he,sy}swapr: apply 2-sided permutation + @defgroup la_hercond la_hercond: Skeel condition number estimate + @defgroup la_herfsx_extended la_herfsx_extended: step in herfsx + @defgroup la_herpvgrw la_herpvgrw: reciprocal pivot growth + + @defgroup hpsv_comp --- packed, rook v1 --- + @defgroup hpcon {hp,sp}con: condition number estimate + @defgroup hptrf {hp,sp}trf: triangular factor + @defgroup hptrs {hp,sp}trs: triangular solve using factor + @defgroup hptri {hp,sp}tri: triangular inverse + @defgroup hprfs {hp,sp}rfs: iterative refinement + + @defgroup hesv_comp_v2 --- full, rook v2 --- + @defgroup hecon_rook {he,sy}con_rook: condition number estimate + @defgroup hetrf_rook {he,sy}trf_rook: triangular factor + @defgroup lahef_rook la{he,sy}f_rook: triangular factor step + @defgroup hetf2_rook {he,sy}tf2_rook: triangular factor, level 2 + @defgroup hetrs_rook {he,sy}trs_rook: triangular solve using factor + @defgroup hetri_rook {he,sy}tri_rook: triangular inverse + + @defgroup hesv_comp_v3 --- full, rook v3 --- + @defgroup hetrf_rk {he,sy}trf_rk: triangular factor + @defgroup lahef_rk la{he,sy}f_rk: triangular factor step + @defgroup hetf2_rk {he,sy}tf2_rk: triangular factor, level 2 + @defgroup syconvf syconvf: convert to/from hetrf to hetrf_rk format + @defgroup syconvf_rook syconvf_rook: convert to/from hetrf_rook to hetrf_rk format + + @defgroup hesv_comp_aasen --- full, Aasen --- + @defgroup hetrf_aa {he,sy}trf_aa: triangular factor + @defgroup lahef_aa la{he,sy}f_aa: triangular factor partial factor + @defgroup hetrs_aa {he,sy}trs_aa: triangular solve using factor + + @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage --- + @defgroup hetrf_aa_2stage {he,sy}trf_aa_2stage: triangular factor + @defgroup hetrs_aa_2stage {he,sy}trs_aa_2stage: triangular solve using factor + @} + + @defgroup trsv_comp_grp Triangular computational routines (solve, cond, etc.) + @{ + @defgroup trsv_comp --- full --- + @defgroup trcon trcon: condition number estimate + @defgroup trtrs trtrs: triangular solve + @defgroup latrs latrs: triangular solve with robust scaling + @defgroup latrs3 latrs3: triangular solve with robust scaling, level 3 + @defgroup trtri trtri: triangular inverse + @defgroup trti2 trti2: triangular inverse, level 2 + @defgroup trrfs trrfs: triangular iterative refinement + @defgroup lauum lauum: triangular multiply: U^H U + @defgroup lauu2 lauu2: triangular multiply: U^H U, level 2 + + @defgroup tpsv_comp --- packed --- + @defgroup tpcon tpcon: condition number estimate + @defgroup tptrs tptrs: triangular solve + @defgroup latps latps: triangular solve with robust scaling + @defgroup tptri tptri: triangular inverse + @defgroup tprfs tprfs: triangular iterative refinement + + @defgroup tfsv_comp --- rectangular full packed (RFP) --- + @defgroup tftri tftri: triangular inverse, RFP + + @defgroup tbsv_comp --- banded --- + @defgroup tbcon tbcon: condition number estimate + @defgroup tbtrs tbtrs: triangular solve + @defgroup latbs latbs: triangular solve with scaling + @defgroup tbrfs tbrfs: triangular iterative refinement + @} + + @defgroup solve_aux_grp Auxiliary routines + @{ + @defgroup lacn2 lacn2: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon + @defgroup lacon lacon: 1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old + @defgroup la_lin_berr la_lin_berr: backward error + @} + @} + + @defgroup gels_top Least squares + @{ + @defgroup gels_driver_grp Standard least squares, min || Ax - b ||_2 + @{ + @defgroup gels gels: least squares using QR/LQ + @defgroup gelst gelst: least squares using QR/LQ with T matrix + @defgroup gelss gelss: least squares using SVD, QR iteration + @defgroup gelsd gelsd: least squares using SVD, divide and conquer + @defgroup gelsy gelsy: least squares using complete orthogonal factor + @defgroup getsls getsls: least squares using tall-skinny QR/LQ + @} + + @defgroup ggls_driver_grp Constrained least squares + @{ + @defgroup gglse gglse: equality-constrained least squares + @defgroup ggglm ggglm: Gauss-Markov linear model + @} + + @defgroup gels_aux_grp Auxiliary routines + @{ + @defgroup laic1 laic1: condition estimate, step in gelsy + @defgroup lals0 lals0: back multiplying factors, step in gelsd + @defgroup lalsa lalsa: SVD of coefficient matrix, step in gelsd + @defgroup lalsd lalsd: uses SVD for least squares, step in gelsd + @} + @} + + @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.) + @{ + @defgroup geqr_comp_grp QR + @{ + @defgroup geqr_comp1 --- flexible --- + @defgroup geqr geqr: QR factor, flexible + @defgroup gemqr gemqr: multiply by Q from geqr + + @defgroup geqr_comp2 --- classic --- + @defgroup geqrf geqrf: QR factor + @defgroup geqr2 geqr2: QR factor, level 2 + @defgroup ungqr {un,or}gqr: generate explicit Q from geqrf + @defgroup ung2r {un,or}g2r: generate explicit Q from geqrf, level 2 + @defgroup unmqr {un,or}mqr: multiply by Q from geqrf + @defgroup unm2r {un,or}m2r: multiply by Q from geqrf, level 2 + + @defgroup geqr_comp3 --- with T --- + @defgroup geqrt geqrt: QR factor, with T + @defgroup geqrt2 geqrt2: QR factor, with T, level 2 + @defgroup geqrt3 geqrt3: QR factor, with T, recursive panel + @defgroup gemqrt gemqrt: multiply by Q from geqrt + + @defgroup geqr_comp4 --- positive --- + @defgroup geqrfp geqrfp: QR factor, diag( R ) ≥ 0 + @defgroup geqr2p geqr2p: QR factor, diag( R ) ≥ 0, level 2 + @} + + @defgroup geqpf_comp_grp QR with pivoting + @{ + @defgroup geqp3 geqp3: QR factor with pivoting, level 3 + @defgroup laqp2 laqp2: step of geqp3 + @defgroup laqps laqps: step of geqp3 + @} + + @defgroup getsqr_comp_grp QR, tall-skinny + @{ + @defgroup latsqr latsqr: tall-skinny QR factor + @defgroup ungtsqr {un,or}gtsqr: generate Q from latsqr + @defgroup ungtsqr_row {un,or}gtsqr_row: generate Q from latsqr + @defgroup larfb_gett larfb_gett: step in ungtsqr_row + @defgroup lamtsqr lamtsqr: multiply by Q from latsqr + @defgroup getsqrhrt getsqrhrt: tall-skinny QR factor, with Householder reconstruction + @defgroup unhr_col {un,or}hr_col: Householder reconstruction + @defgroup launhr_col_getrfnp la{un,or}hr_col_getrfnp: LU factor without pivoting + @defgroup launhr_col_getrfnp2 la{un,or}hr_col_getrfnp2: LU factor without pivoting, level 2 + @} + + @defgroup tpqr_comp_grp QR, triangular-pentagonal + @{ + @defgroup tpqrt tpqrt: QR factor + @defgroup tpqrt2 tpqrt2: QR factor, level 2 + @defgroup tpmqrt tpmqrt: applies Q + @defgroup tprfb tprfb: applies Q (like larfb) + @} + + @defgroup ggqr_comp_grp Generalized QR + @{ + @defgroup ggqrf ggqrf: Generalized QR factor + @} + + @defgroup gelq_comp_grp LQ + @{ + @defgroup gelq_comp1 --- flexible --- + @defgroup gelq gelq: LQ factor, flexible + @defgroup gemlq gemlq: multiply by Q from gelq + + @defgroup gelq_comp2 --- classic --- + @defgroup gelqf gelqf: LQ factor + @defgroup gelq2 gelq2: LQ factor, level 2 + @defgroup unglq {un,or}glq: generate explicit Q from gelqf + @defgroup ungl2 {un,or}gl2: generate explicit Q, level 2, step in unglq + @defgroup unmlq {un,or}mlq: multiply by Q from gelqf + @defgroup unml2 {un,or}ml2: multiply by Q, level 2, step in unmlq + + @defgroup gelq_comp3 --- with T --- + @defgroup gelqt gelqt: LQ factor, with T + @defgroup gelqt3 gelqt3: LQ factor, with T, recursive + @defgroup gemlqt gemlqt: multiply by Q from gelqt + @} + + @defgroup geswlq_comp_grp LQ, short-wide + @{ + @defgroup laswlq laswlq: short-wide LQ factor + @defgroup lamswlq lamswlq: multiply by Q from laswlq + @} + + @defgroup tplq_comp_grp LQ, triangular-pentagonal + @{ + @defgroup tplqt tplqt: QR factor + @defgroup tplqt2 tplqt2: QR factor, level 2 + @defgroup tpmlqt tpmlqt: applies Q + @} + + @defgroup geql_comp_grp QL + @{ + @defgroup geqlf geqlf: QL factor + @defgroup geql2 geql2: QL factor, level 2 + @defgroup ungql {un,or}gql: generate explicit Q from geqlf + @defgroup unmql {un,or}mql: multiply by Q from geqlf + @defgroup ung2l {un,or}g2l: step in ungql + @defgroup unm2l {un,or}m2l: step in unmql + @} + + @defgroup gerq_comp_grp RQ + @{ + @defgroup gerqf gerqf: RQ factor + @defgroup gerq2 gerq2: RQ factor, level 2 + @defgroup ungrq {un,or}grq: generate explicit Q from gerqf + @defgroup unmrq {un,or}mrq: multiply by Q from gerqf + @defgroup unmr2 {un,or}mr2: step in unmrq + @defgroup ungr2 {un,or}gr2: step in ungrq + @} + + @defgroup ggrq_comp_grp Generalized RQ + @{ + @defgroup ggrqf ggrqf: Generalized RQ factor + @} + + @defgroup gerz_comp_grp RZ + @{ + @defgroup tzrzf tzrzf: RZ factor + @defgroup latrz latrz: RZ factor step + @defgroup unmrz {un,or}mrz: multiply by Z from tzrzf + @defgroup unmr3 {un,or}mr3: step in unmrz + @defgroup larz larz: apply reflector + @defgroup larzb larzb: apply block reflector + @defgroup larzt larzt: generate T matrix + @} + + @defgroup gecs_comp_grp Cosine-Sine (CS) decomposition + @{ + @defgroup bbcsd bbcsd: ?? + @defgroup uncsd {un,or}csd: ?? + @defgroup uncsd2by1 {un,or}csd2by1: ?? + @defgroup unbdb {un,or}bdb: bidiagonalize partitioned unitary matrix, step in uncsd + @defgroup unbdb1 {un,or}bdb1: step in uncsd2by1 + @defgroup unbdb2 {un,or}bdb2: step in uncsd2by1 + @defgroup unbdb3 {un,or}bdb3: step in uncsd2by1 + @defgroup unbdb4 {un,or}bdb4: step in uncsd2by1 + @defgroup unbdb5 {un,or}bdb5: step in uncsd2by1 + @defgroup unbdb6 {un,or}bdb6: step in uncsd2by1 + + @defgroup lapmr lapmr: permute rows + @defgroup lapmt lapmt: permute cols + @} + + @defgroup reflector_aux_grp Householder reflectors + @{ + @defgroup larf larf: apply Householder reflector + @defgroup larfx larfx: apply Householder reflector, unrolled + @defgroup larfy larfy: apply Householder reflector symmetrically (2-sided) + @defgroup larfb larfb: apply block Householder reflector + @defgroup larfg larfg: generate Householder reflector + @defgroup larfgp larfgp: generate Householder reflector, beta ≥ 0 + @defgroup larft larft: generate T matrix + @} + + @defgroup rot_aux_grp Givens/Jacobi plane rotations + @{ + @defgroup lartg lartg: generate plane rotation, more accurate than BLAS rot + @defgroup lartgp lartgp: generate plane rotation, more accurate than BLAS rot + @defgroup lasr lasr: apply series of plane rotations + @defgroup largv largv: generate vector of plane rotations + @defgroup lartv lartv: apply vector of plane rotations to vectors + @defgroup lar2v lar2v: apply vector of plane rotations to 2x2 matrices + @defgroup lacrt lacrt: apply plane rotation (unused?) + @} + @} + + @defgroup geev_top Non-symmetric eigenvalues + @{ + @defgroup geev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup geev geev: eig + @defgroup geevx geevx: eig, expert + + @defgroup gees gees: Schur form + @defgroup geesx geesx: Schur form, expert + @} + + @defgroup ggev_driver_grp Generalized eig driver + @{ + @defgroup ggev3 ggev3: eig + @defgroup ggev ggev: eig, unblocked + @defgroup ggevx ggevx: eig, expert + + @defgroup gges3 gges3: Schur form + @defgroup gges gges: Schur form, unblocked + @defgroup ggesx ggesx: Schur form, expert + @} + + @defgroup gedmd DMD driver, Dynamic Mode Decomposition + + @defgroup geev_comp_grp Eig computational routines + @{ + @defgroup gebal gebal: balance matrix + @defgroup gehrd gehrd: reduction to Hessenberg + @defgroup gehd2 gehd2: reduction to Hessenberg, level 2 + @defgroup lahr2 lahr2: step in gehrd + @defgroup unghr {un,or}ghr: generate Q from gehrd + @defgroup unmhr {un,or}mhr: multiply by Q from gehrd + @defgroup gebak gebak: back-transform eigvec + @defgroup hseqr hseqr: Hessenberg eig, QR iteration + @defgroup hsein hsein: Hessenberg inverse iteration for eigvec + @defgroup trevc trevc: eigenvectors of triangular Schur form, old + @defgroup trevc3 trevc3: eigenvectors of triangular Schur form, blocked + @defgroup laln2 laln2: 1x1 or 2x2 solve, step in trevc + + @defgroup trsyl trsyl: Sylvester equation + @defgroup trsyl3 trsyl3: Sylvester equation, level 3 + @defgroup lasy2 lasy2: Sylvester equation + + @defgroup trsna trsna: eig condition numbers + @defgroup laqtr laqtr: quasi-triangular solve + + @defgroup trexc trexc: reorder Schur form + @defgroup trsen trsen: reorder Schur form + @defgroup laexc laexc: reorder Schur form + + @defgroup lanv2 lanv2: 2x2 Schur factor + + @defgroup laqr_group --- hseqr auxiliary --- + @defgroup laein laein: eigvec by Hessenberg inverse iteration + @defgroup lahqr lahqr: eig of Hessenberg, step in hseqr + @defgroup laqr0 laqr0: eig of Hessenberg, step in hseqr + @defgroup laqr1 laqr1: step in hseqr + @defgroup laqr2 laqr2: step in hseqr + @defgroup laqr3 laqr3: step in hseqr + @defgroup laqr4 laqr4: eig of Hessenberg, step in hseqr + @defgroup laqr5 laqr5: step in hseqr + + @defgroup iparmq iparmq: set parameters for hseqr + + @defgroup laqz_group --- ggev3, gges3 auxiliary --- + @defgroup laqz0 laqz0: step in ggev3, gges3 + @defgroup laqz1 laqz1: step in ggev3, gges3 + @defgroup laqz2 laqz2: step in ggev3, gges3 + @defgroup laqz3 laqz3: step in ggev3, gges3 + @defgroup laqz4 laqz4: step in ggev3, gges3 + @} + + @defgroup ggev_comp_grp Generalized eig computational routines + @{ + @defgroup ggbal ggbal: balance matrix + @defgroup gghrd gghrd: reduction to Hessenberg + @defgroup gghd3 gghd3: reduction to Hessenberg, level 3 + @defgroup hgeqz hgeqz: generalized Hessenberg eig + @defgroup ggbak ggbak: back-transform eigvec + @defgroup tgsen tgsen: reorder generalized Schur form + @defgroup tgsna tgsna: reciprocal cond est + @defgroup tgsyl tgsyl: Sylvester equation + @defgroup tgsy2 tgsy2: Sylvester equation panel (?) + @defgroup unm22 {un,or}m22: multiply by banded Q, step in gghd3 + @defgroup lagv2 lagv2: 2x2 generalized Schur factor + @defgroup tgevc tgevc: eigvec of pair of matrices + @defgroup tgexc tgexc: reorder generalized Schur form + @defgroup tgex2 tgex2: reorder generalized Schur form + @} + @} + + @defgroup heev_top Hermitian/symmetric eigenvalues + @{ + @defgroup heev_driver_grp Standard eig driver, AV = VΛ + @{ + @defgroup heev_driver --- full --- + @defgroup heev {he,sy}ev: eig, QR iteration + @defgroup heevd {he,sy}evd: eig, divide and conquer + @defgroup heevr {he,sy}evr: eig, MRRR + @defgroup heevx {he,sy}evx: eig, bisection + + @defgroup heev_driver2 --- full, 2-stage --- + @defgroup heev_2stage {he,sy}ev_2stage: eig, QR iteration + @defgroup heevd_2stage {he,sy}evd_2stage: eig, divide and conquer + @defgroup heevr_2stage {he,sy}evr_2stage: eig, MRRR + @defgroup heevx_2stage {he,sy}evx_2stage: eig, bisection + + @defgroup hpev_driver --- packed --- + @defgroup hpev {hp,sp}ev: eig, QR iteration + @defgroup hpevd {hp,sp}evd: eig, divide and conquer + @defgroup hpevx {hp,sp}evx: eig, bisection + + @defgroup hbev_driver --- banded --- + @defgroup hbev {hb,sb}ev: eig, QR iteration + @defgroup hbevd {hb,sb}evd: eig, divide and conquer + @defgroup hbevx {hb,sb}evx: eig, bisection + + @defgroup hbev_driver2 --- banded, 2nd-stage --- + @defgroup hbev_2stage {hb,sb}ev_2stage: eig, QR iteration + @defgroup hbevd_2stage {hb,sb}evd_2stage: eig, divide and conquer + @defgroup hbevx_2stage {hb,sb}evx_2stage: eig, bisection + + @defgroup stev_driver --- tridiagonal --- + @defgroup stev stev: eig, QR iteration + @defgroup stevd stevd: eig, divide and conquer + @defgroup stevr stevr: eig, MRRR + @defgroup stevx stevx: eig, bisection + @defgroup pteqr pteqr: eig, positive definite tridiagonal + + @defgroup stebz stebz: eig, Kahan + @defgroup sterf sterf: eig, QR iteration + @defgroup stedc stedc: eig, divide and conquer + @defgroup stegr stegr: eig, bisection, see stemr + @defgroup stein stein: eig, inverse iteration + @defgroup stemr stemr: eig, relatively robust representation (RRR) + @defgroup steqr steqr: eig, QR iteration + @} + + @defgroup hegv_driver_grp Generalized eig driver, AV = BVΛ, etc. + @{ + @defgroup hegv_driver --- full --- + @defgroup hegv {he,sy}gv: eig, QR iteration + @defgroup hegv_2stage {he,sy}gv_2stage: eig, QR iteration, 2-stage + @defgroup hegvd {he,sy}gvd: eig, divide and conquer + @defgroup hegvx {he,sy}gvx: eig, bisection + + @defgroup hpgv_driver --- packed --- + @defgroup hpgv {hp,sp}gv: eig, QR iteration + @defgroup hpgvd {hp,sp}gvd: eig, divide and conquer + @defgroup hpgvx {hp,sp}gvx: eig, bisection + + @defgroup hbgv_driver --- banded --- + @defgroup hbgv {hb,sb}gv: eig, QR iteration + @defgroup hbgvd {hb,sb}gvd: eig, divide and conquer + @defgroup hbgvx {hb,sb}gvx: eig, bisection + @} + + @defgroup heev_comp_grp Eig computational routines + @{ + @defgroup heev_comp --- full --- + @defgroup disna disna: eig condition numbers + @defgroup hetrd {he,sy}trd: reduction to tridiagonal + @defgroup hetd2 {he,sy}td2: reduction to tridiagonal, level 2 + @defgroup latrd latrd: step in hetrd + @defgroup ungtr {un,or}gtr: generate Q from hetrd + @defgroup unmtr {un,or}mtr: multiply by Q from hetrd + + @defgroup hetrd_2stage {he,sy}trd_2stage: reduction to tridiagonal, 2-stage + @defgroup hetrd_he2hb {he,sy}trd_he2hb: full to band (1st stage) + @defgroup hetrd_hb2st {he,sy}trd_hb2st: band to tridiagonal (2nd stage) + @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage) + + @defgroup lae2 lae2: 2x2 eig, step in steqr, stemr + @defgroup laesy laesy: 2x2 eig + @defgroup laev2 laev2: 2x2 eig + @defgroup lagtf lagtf: LU factor of (T - λI) + @defgroup lagts lagts: LU solve of (T - λI) x = y + + @defgroup hpev_comp --- packed --- + @defgroup hptrd {hp,sp}trd: reduction to tridiagonal + @defgroup upgtr {up,op}gtr: generate Q from hetrd + @defgroup upmtr {up,op}mtr: multiply by Q from hptrd + + @defgroup hbev_comp --- banded --- + @defgroup hbtrd {hb,sb}trd: reduction to tridiagonal + @} + + @defgroup hegv_comp_grp Generalized eig computational routines + @{ + @defgroup hegst {he,sy}gst: reduction to standard form + @defgroup hegs2 {he,sy}gs2: reduction to standard form, level 2 + @defgroup hpgst {hp,sp}gst: reduction to standard form, packed + @defgroup hbgst {hb,sb}gst: reduction to standard form, banded + @defgroup pbstf pbstf: split Cholesky factor, use with hbgst + @defgroup lag2 lag2: 2x2 eig + @} + + @defgroup stev_comp_grp tridiag bisection routines + @{ + @defgroup laebz laebz: counts eigvals <= value + @defgroup laneg laneg: Sturm count + @} + + @defgroup laed_comp_grp tridiag divide and conquer (D&C) routines + @{ + @defgroup laed0 laed0: D&C step: top level solver + @defgroup laed1 laed1: D&C step: merge subproblems + @defgroup laed2 laed2: D&C step: deflation + @defgroup laed3 laed3: D&C step: secular equation + @defgroup laed4 laed4: D&C step: secular equation nonlinear solver + @defgroup laed5 laed5: D&C step: secular equation, 2x2 + @defgroup laed6 laed6: D&C step: secular equation Newton step + @defgroup lamrg lamrg: permutation to merge 2 sorted lists + + @defgroup laed_comp2 --- eig value only or update Q --- + @defgroup laed7 laed7: D&C step: merge subproblems + @defgroup laed8 laed8: D&C step: deflation + @defgroup laed9 laed9: D&C step: secular equation + @defgroup laeda laeda: D&C step: z vector + @} + + @defgroup larr_comp_grp tridiag RRR routines + @{ + @defgroup larra larra: step in stemr + @defgroup larrb larrb: step in stemr + @defgroup larrc larrc: step in stemr + @defgroup larrd larrd: step in stemr, tridiag eig + @defgroup larre larre: step in stemr + @defgroup larrf larrf: step in stemr, find relative robust representation (RRR) + @defgroup larrj larrj: step in stemr, refine eigval estimates + @defgroup larrk larrk: step in stemr, compute one eigval + @defgroup larrr larrr: step in stemr, test to do expensive tridiag eig algorithm + @defgroup larrv larrv: eig tridiagonal, step in stemr & stegr + @defgroup lar1v lar1v: step in larrv, hence stemr & stegr + @} + @} + + @defgroup svd_top Singular Value Decomposition (SVD) + @{ + @defgroup svd_driver_grp Standard SVD driver, A = UΣV^H + @{ + @defgroup gesvd_driver --- full --- + @defgroup gesvd gesvd: SVD, QR iteration + @defgroup gesvdq gesvdq: SVD, QR with pivoting + @defgroup gesdd gesdd: SVD, divide and conquer + @defgroup gesvdx gesvdx: SVD, bisection + @defgroup gejsv gejsv: SVD, Jacobi, high-level + @defgroup gesvj gesvj: SVD, Jacobi, low-level + + @defgroup bdsvd_driver --- bidiagonal --- + @defgroup bdsqr bdsqr: bidiagonal SVD, QR iteration (dqds) + @defgroup bdsdc bdsdc: bidiagonal SVD, divide and conquer + @defgroup bdsvdx bdsvdx: bidiagonal SVD, bisection + @} + + @defgroup ggsvd_driver_grp Generalized SVD driver + @{ + @defgroup ggsvd3 ggsvd3: SVD, QR iteration + @} + + @defgroup gesvd_comp_grp SVD computational routines + @{ + @defgroup gebrd gebrd: reduction to bidiagonal + @defgroup gebd2 gebd2: reduction to bidiagonal, level 2 + @defgroup labrd labrd: step in gebrd + @defgroup gbbrd gbbrd: band to bidiagonal + @defgroup ungbr {un,or}gbr: generate Q, P from gebrd + @defgroup unmbr {un,or}mbr: multiply by Q, P from gebrd + + @defgroup gesvd_aux --- auxiliary routines --- + @defgroup gsvj0 gsvj0: step in gesvj + @defgroup gsvj1 gsvj1: step in gesvj + @defgroup las2 las2: 2x2 triangular SVD + @defgroup lasv2 lasv2: 2x2 triangular SVD + @defgroup lartgs lartgs: generate plane rotation for bidiag SVD + @} + + @defgroup ggsvd_comp_grp Generalized SVD computational routines + @{ + @defgroup ggsvp3 ggsvp3: step in ggsvd + @defgroup tgsja tgsja: generalized SVD of trapezoidal matrices, step in ggsvd3 + @defgroup lags2 lags2: 2x2 orthogonal factor, step in tgsja + @defgroup lapll lapll: linear dependence of 2 vectors + @} + + @defgroup lasq_comp_grp bidiag QR iteration routines + @{ + @defgroup lasq1 lasq1: dqds step + @defgroup lasq2 lasq2: dqds step + @defgroup lasq3 lasq3: dqds step + @defgroup lasq4 lasq4: dqds step + @defgroup lasq5 lasq5: dqds step + @defgroup lasq6 lasq6: dqds step + @} + + @defgroup lasd_comp_grp bidiag D&C routines + @{ + @defgroup lasd0 lasd0: D&C step: top level solver + @defgroup lasdt lasdt: D&C step: tree + @defgroup lasd1 lasd1: D&C step: merge subproblems + @defgroup lasd2 lasd2: D&C step: deflation + @defgroup lasd3 lasd3: D&C step: secular equation + @defgroup lasd4 lasd4: D&C step: secular equation nonlinear solver + @defgroup lasd5 lasd5: D&C step: secular equation, 2x2 + @defgroup lasdq lasdq: D&C step: leaf using bdsqr + + @defgroup lasd_comp2 --- singular values only or factored form --- + @defgroup lasda lasda: D&C step: top level solver + @defgroup lasd6 lasd6: D&C step: merge subproblems + @defgroup lasd7 lasd7: D&C step: deflation + @defgroup lasd8 lasd8: D&C step: secular equation + @} + @} + + @defgroup blas_like_top BLAS-like + @{ + @defgroup set_grp Initialize, copy, convert + @{ + @defgroup laset laset: set matrix + @defgroup larnv larnv: random vector + @defgroup laruv laruv: random uniform vector + + @defgroup lacpy lacpy: copy matrix + + @defgroup lacp2 lacp2: general matrix, convert real to complex + @defgroup _lag2_ _lag2_: general matrix, convert double <=> single + @defgroup _lat2_ _lat2_: triangular matrix, convert double <=> single + + @defgroup tfttp tfttp: triangular matrix, RFP (tf) to packed (tp) + @defgroup tfttr tfttr: triangular matrix, RFP (tf) to full (tr) + @defgroup tpttf tpttf: triangular matrix, packed (tp) to RFP (tf) + @defgroup tpttr tpttr: triangular matrix, packed (tp) to full (tr) + @defgroup trttf trttf: triangular matrix, full (tr) to RFP (tf) + @defgroup trttp trttp: triangular matrix, full (tr) to packed (tp) + @} + + @defgroup norm_grp Matrix norm + @{ + @defgroup lange lange: general matrix + @defgroup langb langb: general matrix, banded + @defgroup langt langt: general matrix, tridiagonal + + @defgroup lanhs lanhs: Hessenberg + + @defgroup lanhe lan{he,sy}: Hermitian/symmetric matrix + @defgroup lanhf lan{hf,sf}: Hermitian/symmetric matrix, RFP + @defgroup lanhp lan{hp,sp}: Hermitian/symmetric matrix, packed + @defgroup lanhb lan{hb,sb}: Hermitian/symmetric matrix, banded + @defgroup lanht lan{ht,st}: Hermitian/symmetric matrix, tridiagonal + + @defgroup lantr lantr: triangular matrix + @defgroup lantp lantp: triangular matrix, packed + @defgroup lantb lantb: triangular matrix, banded + @} + + @defgroup blas0_like_grp Scalar operations + @{ + @defgroup isnan isnan: test for NaN + @defgroup laisnan laisnan: test for NaN, unoptimized + @defgroup ladiv ladiv: complex divide + @defgroup lapy2 lapy2: robust sqrt( x^2 + y^2 ) + @defgroup lapy3 lapy3: robust sqrt( x^2 + y^2 + z^2 ) + @defgroup larmm larmm: scale factor to avoid overflow, step in latrs + @} + + @defgroup blas1_like_grp Level 1 BLAS-like vector ops + @{ + @defgroup lacgv lacgv: conjugate vector + @defgroup lasrt lasrt: sort vector + @defgroup lassq lassq: sum-of-squares, avoiding over/underflow + @defgroup rscl rscl: scale vector by reciprocal + @} + + @defgroup blas2_like_grp Level 2 BLAS-like matrix-vector ops + @{ + @defgroup ilalc ilalc: find non-zero col + @defgroup ilalr ilalr: find non-zero row + @defgroup lascl lascl: scale matrix + @defgroup la_geamv la_geamv: matrix-vector multiply |A| * |x|, general + @defgroup la_gbamv la_gbamv: matrix-vector multiply |A| * |x|, general banded + @defgroup la_heamv la_heamv: matrix-vector multiply |A| * |x|, Hermitian/symmetric + @defgroup lascl2 lascl2: diagonal scale matrix, A = D A + @defgroup larscl2 larscl2: reciprocal diagonal scale matrix, A = D^{-1} A + @defgroup la_wwaddw la_wwaddw: add to double-double or single-single vector + @} + + @defgroup blas3_like_grp Level 3 BLAS-like matrix-matrix ops + @{ + @defgroup lagtm lagtm: tridiagonal matrix-matrix multiply + @defgroup lacrm lacrm: complex * real matrix-matrix multiply + @defgroup larcm larcm: real * complex matrix-matrix multiply + @defgroup hfrk hfrk: Hermitian rank-k update, RFP format + @defgroup tfsm tfsm: triangular-matrix solve, RFP format + @} + @} + + @defgroup aux_top Auxiliary routines + @{ + @defgroup aux_grp Other auxiliary routines + @{ + @defgroup lsame lsame: string comparison + @defgroup lsamen lsamen: string comparison + @defgroup roundup_lwork roundup_lwork: fix rounding integer to float + @defgroup second second: wall clock timer + @} + + @defgroup params_grp Parameters + @{ + @defgroup lamch lamch: machine parameters + @defgroup lamc1 lamc1: ?? + @defgroup lamc2 lamc2: ?? + @defgroup lamc3 lamc3: ?? + @defgroup lamc4 lamc4: ?? + @defgroup lamc5 lamc5: ?? + @defgroup labad labad: over/underflow on obsolete pre-IEEE machines + @defgroup ilaver ilaver: LAPACK version + @defgroup ilaenv ilaenv: tuning parameters + @defgroup ilaenv2stage ilaenv2stage: tuning parameters for 2-stage eig + @defgroup iparam2stage iparam2stage: sets parameters for 2-stage eig + @defgroup ieeeck ieeeck: verify inf and NaN are safe + @defgroup la_constants la_constants: Fortran 95 module of constants + + @defgroup blast_aux --- BLAST constants --- + @defgroup iladiag iladiag: diag string to BLAST const + @defgroup ilaprec ilaprec: precision string to BLAST const + @defgroup ilatrans ilatrans: trans string to BLAST const + @defgroup ilauplo ilauplo: uplo string to BLAST const + @defgroup la_transtype la_transtype: BLAST const to string + @} + + @defgroup xerbla_grp Error reporting + @{ + @defgroup xerbla xerbla: error reporting + @defgroup xerbla_array xerbla_array: error reporting, callable from C + @} + @} +@} + +@defgroup blas_top BLAS +BLAS are defined by three papers: +Basic linear algebra subprograms for {FORTRAN} usage, Lawson et al, 1979. +An extended set of {FORTRAN} basic linear algebra subprograms, Dongarra et al, 1988. +A set of level 3 basic linear algebra subprograms, Dongarra et al, 1990. +Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in +LAPACK rather than the classic BLAS. +These were extended by the Extra Precision BLAS (XBLAS, not documented here) +https://www.netlib.org/xblas/ + +@{ + @defgroup scalar_grp Scalar operations + @{ + @defgroup abs1 abs1: | real( x ) | + | imag( x ) | + @} + + @defgroup blas1_grp Level 1 BLAS: vector ops + @{ + @defgroup asum asum: sum | real( x_i ) | + | imag( x_i ) | + @defgroup sum1 sum1: sum | x_i | (in LAPACK) + @defgroup axpy axpy: y = ax + y + @defgroup copy copy: y = x + @defgroup dot dot: x^H x and x^T x + @defgroup iamax iamax: argmax_i | real( x_i ) | + | imag( x_i ) | + @defgroup imax1 imax1: argmax_i | x_i | (in LAPACK) + @defgroup nrm2 nrm2: || x ||_2 + @defgroup scal scal: x = alpha x + @defgroup swap swap: x <=> y + + @defgroup rot_comp --- Givens/Jacobi plane rotations --- + @defgroup rot rot: apply plane rotation ([cz]rot in LAPACK) + @defgroup rotg rotg: generate plane rotation (cf. lartg) + @defgroup rotm rotm: apply modified (fast) plane rotation + @defgroup rotmg rotmg: generate modified (fast) plane rotation + @} + + @defgroup blas2_grp Level 2 BLAS: matrix-vector ops + @{ + @defgroup blas2_full --- full --- + @defgroup gemv gemv: general matrix-vector multiply + @defgroup ger ger: general matrix rank-1 update + + @defgroup hemv {he,sy}mv: Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK) + @defgroup her {he,sy}r: Hermitian/symmetric rank-1 update + @defgroup her2 {he,sy}r2: Hermitian/symmetric rank-2 update + + @defgroup trmv trmv: triangular matrix-vector multiply + @defgroup trsv trsv: triangular matrix-vector solve + + @defgroup blas2_packed --- packed --- + @defgroup hpmv {hp,sp}mv: Hermitian/symmetric matrix-vector multiply + @defgroup hpr {hp,sp}r: Hermitian/symmetric rank-1 update + @defgroup hpr2 {hp,sp}r2: Hermitian/symmetric rank-2 update + + @defgroup tpmv tpmv: triangular matrix-vector multiply + @defgroup tpsv tpsv: triangular matrix-vector solve + + @defgroup blas2_banded --- banded --- + @defgroup gbmv gbmv: general matrix-vector multiply + + @defgroup hbmv {hb,sb}mv: Hermitian/symmetric matrix-vector multiply + + @defgroup tbmv tbmv: triangular matrix-vector multiply + @defgroup tbsv tbsv: triangular matrix-vector solve + @} + + @defgroup blas3_grp Level 3 BLAS: matrix-matrix ops + @{ + @defgroup gemm gemm: general matrix-matrix multiply + + @defgroup hemm {he,sy}mm: Hermitian/symmetric matrix-matrix multiply + @defgroup herk {he,sy}rk: Hermitian/symmetric rank-k update + @defgroup her2k {he,sy}r2k: Hermitian/symmetric rank-2k update + + @defgroup trmm trmm: triangular matrix-matrix multiply + @defgroup trsm trsm: triangular matrix-matrix solve + @} +@} + **/ From 3d9e20f61455ca6893744c2f9a0e19d89d9c5998 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Nov 2023 18:51:54 +0100 Subject: [PATCH 437/718] Update version to 3.12.0 --- lapack-netlib/INSTALL/ilaver.f | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index a246c37cb..1827d5cd2 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -44,7 +44,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 * *> \ingroup auxOTHERauxiliary * @@ -60,7 +59,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 11 + VERS_MINOR = 12 VERS_PATCH = 0 * ===================================================================== * From 578f0f95901617941c7de5ddc62fa97f12f9c0a0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Nov 2023 18:53:16 +0100 Subject: [PATCH 438/718] Update version number to 3.12.0 --- lapack-netlib/INSTALL/ilaver.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 184e9b78d..d64c841a2 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -315,7 +315,6 @@ typedef struct Namelist Namelist; /* > \author Univ. of Colorado Denver */ /* > \author NAG Ltd. */ -/* > \date November 2019 */ /* > \ingroup auxOTHERauxiliary */ @@ -332,7 +331,7 @@ typedef struct Namelist Namelist; /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 11; + *vers_minor__ = 12; *vers_patch__ = 0; /* ===================================================================== */ From 08be9004f89e41607e4c19de1e3d1018ee536728 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Nov 2023 18:57:17 +0100 Subject: [PATCH 439/718] Update version number and copyright date to Reference-LAPACK 3.12.0 --- lapack-netlib/CMakeLists.txt | 240 ++++++++++++++++------------------- lapack-netlib/LICENSE | 6 +- lapack-netlib/README.md | 3 +- 3 files changed, 111 insertions(+), 138 deletions(-) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index fefaa8b89..f1f47ae24 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -1,9 +1,9 @@ -cmake_minimum_required(VERSION 3.2) +cmake_minimum_required(VERSION 3.6) -project(LAPACK Fortran C) +project(LAPACK) set(LAPACK_MAJOR_VERSION 3) -set(LAPACK_MINOR_VERSION 11) +set(LAPACK_MINOR_VERSION 12) set(LAPACK_PATCH_VERSION 0) set( LAPACK_VERSION @@ -45,6 +45,14 @@ if(_is_coverage_build) find_package(codecov) endif() +# Use valgrind if it is found +option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON ) +find_program( MEMORYCHECK_COMMAND valgrind ) +if( MEMORYCHECK_COMMAND ) + message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" ) + set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" ) +endif() + # By default test Fortran compiler complex abs and complex division option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF) if( TEST_FORTRAN_COMPILER ) @@ -76,7 +84,7 @@ if( TEST_FORTRAN_COMPILER ) WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err" SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f ) - + endif() # By default static library @@ -99,6 +107,8 @@ else() set(LAPACKELIB "lapacke") set(TMGLIB "tmglib") endif() +# By default build standard API and extended _64 API +option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON) include(GNUInstallDirs) @@ -127,90 +137,6 @@ configure_file( include(PreventInSourceBuilds) include(PreventInBuildInstalls) -# Check if recursive flag exists -include(CheckFortranCompilerFlag) -if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang) - check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag) -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU) - check_fortran_compiler_flag("-frecursive" _frecursiveFlag) -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) - check_fortran_compiler_flag("-recursive" _recursiveFlag) -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - check_fortran_compiler_flag("-qrecur" _qrecurFlag) -elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG) - check_fortran_compiler_flag("-recursive" _recursiveFlag) -else() - message(WARNING "Fortran local arrays should be allocated on the stack." - " Please use a compiler which guarantees that feature." - " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.") -endif() - -# Add recursive flag -if(_MrecursiveFlag) - string(REGEX MATCH "-Mrecursive" output_test "${CMAKE_Fortran_FLAGS}") - if(NOT output_test) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive" - CACHE STRING "Recursive flag must be set" FORCE) - endif() -elseif(_frecursiveFlag) - string(REGEX MATCH "-frecursive" output_test "${CMAKE_Fortran_FLAGS}") - if(NOT output_test) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive" - CACHE STRING "Recursive flag must be set" FORCE) - endif() -elseif(_recursiveFlag) - string(REGEX MATCH "-recursive" output_test "${CMAKE_Fortran_FLAGS}") - if(NOT output_test) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive" - CACHE STRING "Recursive flag must be set" FORCE) - endif() -elseif(_qrecurFlag) - string(REGEX MATCH "-qrecur" output_test "${CMAKE_Fortran_FLAGS}") - if(NOT output_test) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur" - CACHE STRING "Recursive flag must be set" FORCE) - endif() -endif() - -if(UNIX) - if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict") - endif() - if(CMAKE_Fortran_COMPILER_ID STREQUAL XL) - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict") - endif() -# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler. -# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin - string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}") -endif() - -if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq) - if(WIN32) - if(CMAKE_GENERATOR STREQUAL "NMake Makefiles") - get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE) - message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}") - set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM}) - string(TOLOWER "${cmd}" cmdlc) - if(cmdlc STREQUAL "df") - message(STATUS "Assume the Compaq Visual Fortran Compiler is being used") - set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) - set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1) - #This is a workaround that is needed to avoid forward-slashes in the - #filenames listed in response files from incorrectly being interpreted as - #introducing compiler command options - if(${BUILD_SHARED_LIBS}) - message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.") - endif() - set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n") - set(str "${str} included with the CVF distribution fails to build Lapack because\n") - set(str "${str} the number of source files exceeds the limit for NMake v6.0\n") - message(STATUS ${str}) - set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out: ") - endif() - endif() - endif() -endif() - # Add option to enable flat namespace for symbol resolution on macOS if(APPLE) option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF) @@ -268,26 +194,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib) -# -------------------------------------------------- -# Check for any necessary platform specific compiler flags -include(CheckLAPACKCompilerFlags) -CheckLAPACKCompilerFlags() - -# -------------------------------------------------- -# Check second function - -include(CheckTimeFunction) -set(TIME_FUNC NONE) -CHECK_TIME_FUNCTION(NONE TIME_FUNC) -CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) -CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) -CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC) -CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC) -message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.") - -set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f) -set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f) - # deprecated LAPACK and LAPACKE routines option(BUILD_DEPRECATED "Build deprecated routines" OFF) message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}") @@ -380,18 +286,27 @@ endif() # Check the usage of the user provided or automatically found LAPACK libraries if(LAPACK_LIBRARIES) - include(CheckFortranFunctionExists) - set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) - # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES - CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND) - unset(CMAKE_REQUIRED_LIBRARIES) - if(LATESTLAPACK_FOUND) - message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") + include(CheckLanguage) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + include(CheckFortranFunctionExists) + set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) + # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES + CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND) + unset(CMAKE_REQUIRED_LIBRARIES) + if(LATESTLAPACK_FOUND) + message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") + else() + message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.") + message(ERROR "--> Will use REFERENCE LAPACK (by default)") + message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ") + message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK") + endif() else() - message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.") - message(ERROR "--> Will use REFERENCE LAPACK (by default)") - message(ERROR "--> Or Correct your LAPACK_LIBRARIES entry ") - message(ERROR "--> Or Consider checking USE_OPTIMIZED_LAPACK") + message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") + message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.") + set(LATESTLAPACK_FOUND TRUE) endif() endif() @@ -399,6 +314,27 @@ endif() if(NOT LATESTLAPACK_FOUND) message(STATUS "Using supplied NETLIB LAPACK implementation") set(LAPACK_LIBRARIES ${LAPACKLIB}) + + enable_language(Fortran) + + # Check for any necessary platform specific compiler flags + include(CheckLAPACKCompilerFlags) + CheckLAPACKCompilerFlags() + + # Check second function + include(CheckTimeFunction) + set(TIME_FUNC NONE) + CHECK_TIME_FUNCTION(NONE TIME_FUNC) + CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC) + CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC) + CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC) + CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC) + + # Set second function + message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.") + set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f) + set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f) + add_subdirectory(SRC) else() set(CMAKE_EXE_LINKER_FLAGS @@ -431,9 +367,11 @@ endif() # Cache export target set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME}) if(BUILD_TESTING OR LAPACKE_WITH_TMG) + enable_language(Fortran) if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG) set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) # Check if dlatms (part of tmg) is found + include(CheckFortranFunctionExists) CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND) unset(CMAKE_REQUIRED_LIBRARIES) if(NOT LAPACK_WITH_TMGLIB_FOUND) @@ -448,6 +386,12 @@ endif() set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE}) unset(LAPACK_INSTALL_EXPORT_NAME_CACHE) + +#------------------------------------- +# LAPACKE +# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built +add_subdirectory(LAPACKE/include) + if(LAPACKE) add_subdirectory(LAPACKE) endif() @@ -474,8 +418,8 @@ if (BLAS++) ExternalProject_Add(blaspp URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp - BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make - INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . ) ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES}) endif() @@ -487,16 +431,16 @@ if (LAPACK++) ExternalProject_Add(lapackpp URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp - BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make - INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . ) else () # FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files ExternalProject_Add(lapackpp URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp - BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make - INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install + BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build . + INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install . ) endif() ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) @@ -671,22 +615,34 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage") set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION}) set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS) - set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png) + set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png) set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES) set(DOXYGEN_SOURCE_BROWSER YES) - set(DISTRIBUTE_GROUP_DOC YES) set(DOXYGEN_CREATE_SUBDIRS YES) set(DOXYGEN_SEPARATE_MEMBER_PAGES YES) set(DOXYGEN_EXTRACT_ALL YES) - set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h") + set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h ) set(DOXYGEN_RECURSIVE YES) set(DOXYGEN_GENERATE_TREEVIEW YES) + set(DOXYGEN_DOT_IMAGE_FORMAT svg) set(DOXYGEN_INTERACTIVE_SVG YES) - set(DOXYGEN_QUIET YES) - set(DOXYGEN_WARNINGS NO) + set(DOXYGEN_QUIET NO) + set(DOXYGEN_WARNINGS YES) + set(DOXYGEN_WARN_NO_PARAMDOC YES) + set(DOXYGEN_WARN_LOGFILE doxygen_error) set(DOXYGEN_GENERATE_HTML NO) set(DOXYGEN_GENERATE_MAN NO) - + set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml") + + # Exclude functions that are duplicated, creating conflicts. + set(DOXYGEN_EXCLUDE .git + .github + SRC/VARIANTS + BLAS/SRC/lsame.f + BLAS/SRC/xerbla.f + BLAS/SRC/xerbla_array.f + INSTALL/slamchf77.f + INSTALL/dlamchf77.f ) if (BUILD_HTML_DOCUMENTATION) set(DOXYGEN_GENERATE_HTML YES) @@ -697,13 +653,21 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) doxygen_add_docs( html - ${PROJECT_SOURCE_DIR} + + # Doxygen INPUT = + ${PROJECT_SOURCE_DIR}/README.md + ${PROJECT_SOURCE_DIR}/BLAS + ${PROJECT_SOURCE_DIR}/CBLAS + ${PROJECT_SOURCE_DIR}/SRC + ${PROJECT_SOURCE_DIR}/INSTALL + ${PROJECT_SOURCE_DIR}/TESTING + ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox + COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)" ) endif() if (BUILD_MAN_DOCUMENTATION) set(DOXYGEN_GENERATE_MAN YES) - set(DOXYGEN_EXCLUDE SRC/VARIANTS) set(DOXYGEN_MAN_LINKS YES) set(DOXYGEN_INLINE_SOURCES NO) set(DOXYGEN_CALL_GRAPH NO) @@ -711,7 +675,15 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION) doxygen_add_docs( man - ${PROJECT_SOURCE_DIR} + + # Doxygen INPUT = + ${PROJECT_SOURCE_DIR}/BLAS + ${PROJECT_SOURCE_DIR}/CBLAS + ${PROJECT_SOURCE_DIR}/SRC + ${PROJECT_SOURCE_DIR}/INSTALL + ${PROJECT_SOURCE_DIR}/TESTING + ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox + COMMENT "Generating man LAPACK documentation" ) endif() diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE index 94cdb0f85..96b04c988 100644 --- a/lapack-netlib/LICENSE +++ b/lapack-netlib/LICENSE @@ -1,9 +1,9 @@ -Copyright (c) 1992-2017 The University of Tennessee and The University +Copyright (c) 1992-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. -Copyright (c) 2000-2017 The University of California Berkeley. All +Copyright (c) 2000-2023 The University of California Berkeley. All rights reserved. -Copyright (c) 2006-2017 The University of Colorado Denver. All rights +Copyright (c) 2006-2023 The University of Colorado Denver. All rights reserved. $COPYRIGHT$ diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index 142aa7b72..a00d4c51d 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -6,7 +6,7 @@ [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) - +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack) * VERSION 1.0 : February 29, 1992 * VERSION 1.0a : June 30, 1992 @@ -37,6 +37,7 @@ * VERSION 3.10.0 : June 2021 * VERSION 3.10.1 : April 2022 * VERSION 3.11.0 : November 2022 +* VERSION 3.12.0 : November 2023 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra. From 01c7010543d1abb5e99da0b97b731aff1acd5aa4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Korinsky" Date: Mon, 27 Nov 2023 14:28:08 +0000 Subject: [PATCH 440/718] cmake/openblas.pc.in: fixed version and URL --- cmake/openblas.pc.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 7e120af86..11e5606e5 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version -Version: @OPENBLAS_VERSION@ -URL: https://github.com/xianyi/OpenBLAS +Version: @OpenBLAS_VERSION@ +URL: https://github.com/OpenMathLib/OpenBLAS Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From 9beee55167c80d7fee7aadb78856c27c7d4b4fe2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Korinsky" Date: Mon, 27 Nov 2023 16:54:49 +0000 Subject: [PATCH 441/718] Enable overstep of too long args without DYNAMIC_ARCH --- CMakeLists.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 69077322a..7c6b96f41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() -if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) +# Seems that this hack doesn't required since macOS 11 Big Sur +if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) if (NOT NOFORTRAN) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " - "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " - "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From ff92e6e7071aa09dcf0f3d7196bdf53d1bbfc422 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Nov 2023 12:53:35 +0100 Subject: [PATCH 442/718] Fix installation location of lapacke_mangling header --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c6b96f41..6b02864ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,7 +542,7 @@ if(NOT NO_LAPACKE) ADD_CUSTOM_TARGET(genlapacke COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() # Install pkg-config files From 1310a0931bc614d8b8bc211ddb82d72cd64aee43 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 15 Nov 2023 16:54:06 +0800 Subject: [PATCH 443/718] loongarch: Refine build control for loongarch64. 1. Use getauxval instead of cpucfg to test hardware capability. 2. Remove unnecessary code and option for compiler check in c_check. --- c_check | 6 ++---- c_check.pl | 6 ++---- cpuid_loongarch64.c | 16 +++++++++++----- driver/others/dynamic_loongarch64.c | 19 +++++++------------ 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/c_check b/c_check index b018c10a8..b5e4a9ad0 100755 --- a/c_check +++ b/c_check @@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then tmpd="$(mktemp -d)" tmplsx="$tmpd/lsx.c" codelsx='"vadd.b $vr0, $vr0, $vr0"' - lsx_flags='-march=loongarch64 -mlsx' - printf "#include \n\n" >> "$tmplsx" + lsx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" args="$lsx_flags -o $tmplsx.o $tmplsx" { @@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then tmplasx="$tmpd/lasx.c" codelasx='"xvadd.b $xr0, $xr0, $xr0"' - lasx_flags='-march=loongarch64 -mlasx' - printf "#include \n\n" >> "$tmplasx" + lasx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" args="$lasx_flags -o $tmplasx.o $tmplasx" { diff --git a/c_check.pl b/c_check.pl index 7a860a211..d9c36793c 100644 --- a/c_check.pl +++ b/c_check.pl @@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) { } else { $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; - $lsx_flags = "-march=loongarch64 -mlsx"; - print $tmplsx "#include \n\n"; + $lsx_flags = "-march=loongarch64"; print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; $args = "$lsx_flags -o $tmplsx.o $tmplsx"; @@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) { $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; - $lasx_flags = "-march=loongarch64 -mlasx"; - print $tmplasx "#include \n\n"; + $lasx_flags = "-march=loongarch64"; print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; $args = "$lasx_flags -o $tmplasx.o $tmplasx"; diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 7c389db27..0ad32ae4e 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LA_HWCAP_LSX (1<<4) -#define LA_HWCAP_LASX (1<<5) +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,11 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - int flag = (int)getauxval(AT_HWCAP); + int hwcap = (int)getauxval(AT_HWCAP); - if (flag & LA_HWCAP_LASX) + if (hwcap & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (flag & LA_HWCAP_LSX) + else if (hwcap & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; @@ -94,7 +94,9 @@ void get_subdirname(void) { } void get_cpuconfig(void) { + uint32_t hwcaps = 0; int d = detect(); + switch (d) { case CPU_LOONGSON3R5: printf("#define LOONGSON3R5\n"); @@ -129,6 +131,10 @@ void get_cpuconfig(void) { printf("#define L2_ASSOCIATIVE 16\n"); break; } + + hwcaps = (uint32_t)getauxval( AT_HWCAP ); + if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); + if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); } void get_libname(void){ diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c index 52f8bcb2f..44de59669 100644 --- a/driver/others/dynamic_loongarch64.c +++ b/driver/others/dynamic_loongarch64.c @@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +#include #include "common.h" extern gotoblas_t gotoblas_LOONGSON3R5; @@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) { return NULL; } -#define LASX_MASK 1<<7 -#define LSX_MASK 1<<6 -#define LOONGARCH_CFG2 0x02 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static gotoblas_t *get_coretype(void) { - int ret = 0; - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(ret) - : "r"(LOONGARCH_CFG2) - ); - - if (ret & LASX_MASK) + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LASX) return &gotoblas_LOONGSON3R5; - else if (ret & LSX_MASK) + else if (hwcap & LA_HWCAP_LSX) return &gotoblas_LOONGSON2K1000; else return &gotoblas_LOONGSONGENERIC; From 3def6a8143253a7b3c5b6477d803ecdb1165bfbc Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 15 Nov 2023 17:24:33 +0800 Subject: [PATCH 444/718] loongarch: Add LASX optimization for dot. --- common_loongarch64.h | 19 ++ kernel/loongarch64/KERNEL.LOONGSON2K1000 | 0 kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 + kernel/loongarch64/dot_lasx.S | 309 +++++++++++++++++++++++ 4 files changed, 332 insertions(+) create mode 100644 kernel/loongarch64/KERNEL.LOONGSON2K1000 create mode 100644 kernel/loongarch64/dot_lasx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index ce1fcf091..4963b2f07 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -124,7 +124,17 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d + +#define XVFSUB xvfsub.d +#define XVFADD xvfadd.d +#define XVFMADD xvfmadd.d + +#define VFSUB vfsub.d +#define VFADD vfadd.d +#define VFMADD vfmadd.d + #else + #define LD fld.s #define ST fst.s #define MADD fmadd.s @@ -142,6 +152,15 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s + +#define XVFSUB xvfsub.s +#define XVFADD xvfadd.s +#define XVFMADD xvfmadd.s + +#define VFSUB vfsub.s +#define VFADD vfadd.s +#define VFMADD vfmadd.s + #endif /* defined(DOUBLE) */ #if defined(__64BIT__) && defined(USE64BITINT) diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 new file mode 100644 index 000000000..e69de29bb diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 011e8b89e..d1b93b0ca 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,4 +1,8 @@ ifndef NO_LASX + +SDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S new file mode 100644 index 000000000..9d3cbf63d --- /dev/null +++ b/kernel/loongarch64/dot_lasx.S @@ -0,0 +1,309 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. */ +#define s1 $f8 +#define s2 $f9 +#define a1 $f10 +#define b1 $f11 + +PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + SUB s1, s1, s1 + SUB s2, s2, s2 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + bne INCX, TEMP, .L20 /* inc_x=1 */ + bne INCY, TEMP, .L20 /* inc_y=1 */ +#ifdef DOUBLE + srai.d I, N, 4 +#else + srai.d I, N, 5 +#endif + + /* init $xr8 and $xr9 to zero */ +#ifdef DOUBLE + xvldrepl.d $xr0, X, 0 +#else + xvldrepl.w $xr0, X, 0 +#endif + XVFSUB $xr8, $xr0, $xr0 + XVFSUB $xr9, $xr0, $xr0 + +/* !((inc_x == 1) && (inc_y == 1)) */ + bge $r0, I, .L12 /* <32 */ +.L11: + /* case 32~ */ + xvld $xr0, X, 0 + xvld $xr1, X, 32 + xvld $xr2, X, 64 + xvld $xr3, X, 96 + xvld $xr4, Y, 0 + xvld $xr5, Y, 32 + xvld $xr6, Y, 64 + xvld $xr7, Y, 96 + addi.w I, I, -1 + addi.d X, X, 128 + addi.d Y, Y, 128 + XVFMADD $xr8, $xr0, $xr4, $xr8 + XVFMADD $xr9, $xr1, $xr5, $xr9 + XVFMADD $xr8, $xr2, $xr6, $xr8 + XVFMADD $xr9, $xr3, $xr7, $xr9 + bnez I, .L11 +.L12: +#ifdef DOUBLE + andi I, N, 0xf + srai.d I, I, 2 +#else + andi I, N, 0x1f + srai.d I, I, 3 +#endif + bge $r0, I, .L14 /* <8 */ +.L13: + /* case 8~31 */ + xvld $xr0, X, 0 + xvld $xr4, Y, 0 + addi.w I, I, -1 + addi.d X, X, 32 + addi.d Y, Y, 32 + XVFMADD $xr8, $xr0, $xr4, $xr8 + bnez I, .L13 +.L14: + /* store dot in s1 $f8 */ + XVFADD $xr8, $xr8, $xr9 + SUB s2, s2, s2 /* set s2 to 0.0 */ + xvpermi.q $xr0, $xr8, 0x1 + VFADD $vr8, $vr8, $vr0 + vpackod.d $vr0, $vr8, $vr8 +#ifdef DOUBLE + VFADD $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr0 + vpackod.w $vr0, $vr8, $vr8 + VFADD $vr8, $vr8, $vr0 +#endif +.L15: +#ifdef DOUBLE + andi I, N, 0x3 +#else + andi I, N, 0x7 +#endif + bge $r0, I, .L999 /* =0 */ + .align 3 +.L16: + /* case 1~7 */ + LD a1, X, 0 + LD b1, Y, 0 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + bnez I, .L16 + b .L999 + .align 3 + +.L20: +/* !((inc_x == 1) && (inc_y == 1)) */ + srai.d I, N, 3 +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 /* <8 */ + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + +EPILOGUE From 13b8c44b44f7fc2eea4f1839b034802a8b1edcae Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 24 Nov 2023 16:40:32 +0800 Subject: [PATCH 445/718] loongarch: Add optimization for dsdot kernel. --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 5 +- kernel/loongarch64/dot_lasx.S | 83 +++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index d1b93b0ca..020a82303 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,7 +1,8 @@ ifndef NO_LASX -SDOTKERNEL = dot_lasx.S -DDOTKERNEL = dot_lasx.S +SDOTKERNEL = dot_lasx.S +DSDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S index 9d3cbf63d..0715b6311 100644 --- a/kernel/loongarch64/dot_lasx.S +++ b/kernel/loongarch64/dot_lasx.S @@ -51,6 +51,8 @@ PROLOGUE LDINT INCX, 0(INCX) LDINT INCY, 0(INCY) #endif + + /* init $f8 and $f9 to zero */ SUB s1, s1, s1 SUB s2, s2, s2 slli.d INCX, INCX, BASE_SHIFT @@ -59,11 +61,8 @@ PROLOGUE bge $r0, N, .L999 bne INCX, TEMP, .L20 /* inc_x=1 */ bne INCY, TEMP, .L20 /* inc_y=1 */ -#ifdef DOUBLE - srai.d I, N, 4 -#else - srai.d I, N, 5 -#endif + + /* !((inc_x == 1) && (inc_y == 1)) */ /* init $xr8 and $xr9 to zero */ #ifdef DOUBLE @@ -71,13 +70,24 @@ PROLOGUE #else xvldrepl.w $xr0, X, 0 #endif +#ifdef DSDOT + xvfcvtl.d.s $xr0, $xr0 + xvfsub.d $xr8, $xr0, $xr0 + xvfsub.d $xr9, $xr0, $xr0 +#else XVFSUB $xr8, $xr0, $xr0 XVFSUB $xr9, $xr0, $xr0 +#endif -/* !((inc_x == 1) && (inc_y == 1)) */ - bge $r0, I, .L12 /* <32 */ +#ifdef DOUBLE + srai.d I, N, 4 +#else + srai.d I, N, 5 +#endif + bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */ + .align 3 .L11: - /* case 32~ */ + /* FLOAT: 32~ ; DOUBLE: 16~ */ xvld $xr0, X, 0 xvld $xr1, X, 32 xvld $xr2, X, 64 @@ -89,11 +99,39 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 128 addi.d Y, Y, 128 +#ifdef DSDOT + xvfcvtl.d.s $xr10, $xr0 + xvfcvtl.d.s $xr11, $xr4 + xvfcvth.d.s $xr12, $xr0 + xvfcvth.d.s $xr13, $xr4 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr1 + xvfcvtl.d.s $xr11, $xr5 + xvfcvth.d.s $xr12, $xr1 + xvfcvth.d.s $xr13, $xr5 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr2 + xvfcvtl.d.s $xr11, $xr6 + xvfcvth.d.s $xr12, $xr2 + xvfcvth.d.s $xr13, $xr6 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr3 + xvfcvtl.d.s $xr11, $xr7 + xvfcvth.d.s $xr12, $xr3 + xvfcvth.d.s $xr13, $xr7 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 +#else XVFMADD $xr8, $xr0, $xr4, $xr8 XVFMADD $xr9, $xr1, $xr5, $xr9 XVFMADD $xr8, $xr2, $xr6, $xr8 XVFMADD $xr9, $xr3, $xr7, $xr9 +#endif bnez I, .L11 + .align 3 .L12: #ifdef DOUBLE andi I, N, 0xf @@ -102,18 +140,37 @@ PROLOGUE andi I, N, 0x1f srai.d I, I, 3 #endif - bge $r0, I, .L14 /* <8 */ + bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */ + .align 3 .L13: - /* case 8~31 */ + /* FLOAT: 8~31 ; DOUBLE: 4~15 */ xvld $xr0, X, 0 xvld $xr4, Y, 0 addi.w I, I, -1 addi.d X, X, 32 addi.d Y, Y, 32 +#ifdef DSDOT + xvfcvtl.d.s $xr10, $xr0 + xvfcvtl.d.s $xr11, $xr4 + xvfcvth.d.s $xr12, $xr0 + xvfcvth.d.s $xr13, $xr4 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 +#else XVFMADD $xr8, $xr0, $xr4, $xr8 +#endif bnez I, .L13 + .align 3 .L14: /* store dot in s1 $f8 */ +#ifdef DSDOT + xvfadd.d $xr8, $xr8, $xr9 + fsub.s s2, s2, s2, /* set s2 to 0.0 */ + xvpermi.q $xr0, $xr8, 0x1 + vfadd.d $vr8, $vr8, $vr0 + vpackod.d $vr0, $vr8, $vr8 + vfadd.d $vr8, $vr8, $vr0 +#else XVFADD $xr8, $xr8, $xr9 SUB s2, s2, s2 /* set s2 to 0.0 */ xvpermi.q $xr0, $xr8, 0x1 @@ -125,7 +182,9 @@ PROLOGUE VFADD $vr8, $vr8, $vr0 vpackod.w $vr0, $vr8, $vr8 VFADD $vr8, $vr8, $vr0 -#endif +#endif /* defined DOUBLE */ +#endif /* defined DSDOT */ + .align 3 .L15: #ifdef DOUBLE andi I, N, 0x3 @@ -135,7 +194,7 @@ PROLOGUE bge $r0, I, .L999 /* =0 */ .align 3 .L16: - /* case 1~7 */ + /* FLOAT: 1~7 ; DOUBLE: 1~3 */ LD a1, X, 0 LD b1, Y, 0 #ifdef DSDOT From 9fe07d82fdeb4c2d81bdf57415ce1db157109bb1 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 24 Nov 2023 17:57:14 +0800 Subject: [PATCH 446/718] loongarch: Add LSX optimization for dot. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 7 + kernel/loongarch64/dot_lsx.S | 364 +++++++++++++++++++++++ 2 files changed, 371 insertions(+) create mode 100644 kernel/loongarch64/dot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e69de29bb..b2a396674 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -0,0 +1,7 @@ +ifndef NO_LSX + +SDOTKERNEL = dot_lsx.S +DSDOTKERNEL = dot_lsx.S +DDOTKERNEL = dot_lsx.S + +endif diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S new file mode 100644 index 000000000..f7f613553 --- /dev/null +++ b/kernel/loongarch64/dot_lsx.S @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. */ +#define s1 $f8 +#define s2 $f9 +#define a1 $f10 +#define b1 $f11 + +PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + /* init $f8 and $f9 to zero */ + SUB s1, s1, s1 + SUB s2, s2, s2 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + bne INCX, TEMP, .L20 /* inc_x=1 */ + bne INCY, TEMP, .L20 /* inc_y=1 */ + + /* !((inc_x == 1) && (inc_y == 1)) */ + + /* init $vr8 and $vr9 to zero */ +#ifdef DOUBLE + vldrepl.d $vr0, X, 0 +#else + vldrepl.w $vr0, X, 0 +#endif +#ifdef DSDOT + vfcvtl.d.s $vr0, $vr0 + vfsub.d $vr8, $vr0, $vr0 + vfsub.d $vr9, $vr0, $vr0 +#else + VFSUB $vr8, $vr0, $vr0 + VFSUB $vr9, $vr0, $vr0 +#endif + +#ifdef DOUBLE + srai.d I, N, 3 +#else + srai.d I, N, 4 +#endif + bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */ + .align 3 +.L11: + /* FLOAT: 16~ ; DOUBLE: 8~ */ + vld $vr0, X, 0 + vld $vr1, X, 16 + vld $vr2, X, 32 + vld $vr3, X, 48 + vld $vr4, Y, 0 + vld $vr5, Y, 16 + vld $vr6, Y, 32 + vld $vr7, Y, 48 + addi.w I, I, -1 + addi.d X, X, 64 + addi.d Y, Y, 64 +#ifdef DSDOT + vfcvtl.d.s $vr10, $vr0 + vfcvtl.d.s $vr11, $vr4 + vfcvth.d.s $vr12, $vr0 + vfcvth.d.s $vr13, $vr4 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr1 + vfcvtl.d.s $vr11, $vr5 + vfcvth.d.s $vr12, $vr1 + vfcvth.d.s $vr13, $vr5 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr2 + vfcvtl.d.s $vr11, $vr6 + vfcvth.d.s $vr12, $vr2 + vfcvth.d.s $vr13, $vr6 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr3 + vfcvtl.d.s $vr11, $vr7 + vfcvth.d.s $vr12, $vr3 + vfcvth.d.s $vr13, $vr7 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 +#else + VFMADD $vr8, $vr0, $vr4, $vr8 + VFMADD $vr9, $vr1, $vr5, $vr9 + VFMADD $vr8, $vr2, $vr6, $vr8 + VFMADD $vr9, $vr3, $vr7, $vr9 +#endif + bnez I, .L11 + .align 3 +.L12: +#ifdef DOUBLE + andi I, N, 0x7 + srai.d I, I, 1 +#else + andi I, N, 0xf + srai.d I, I, 2 +#endif + bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */ + .align 3 +.L13: + /* FLOAT: 4~15 ; DOUBLE: 2~7 */ + vld $vr0, X, 0 + vld $vr4, Y, 0 + addi.w I, I, -1 + addi.d X, X, 16 + addi.d Y, Y, 16 +#ifdef DSDOT + vfcvtl.d.s $vr10, $vr0 + vfcvtl.d.s $vr11, $vr4 + vfcvth.d.s $vr12, $vr0 + vfcvth.d.s $vr13, $vr4 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 +#else + VFMADD $vr8, $vr0, $vr4, $vr8 +#endif + bnez I, .L13 + .align 3 +.L14: + /* store dot in s1 $f8 */ +#ifdef DSDOT + vfadd.d $vr8, $vr8, $vr9 + fsub.s s2, s2, s2, /* set s2 to 0.0 */ + vpackod.d $vr0, $vr8, $vr8 + vfadd.d $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr9 + SUB s2, s2, s2 /* set s2 to 0.0 */ + vpackod.d $vr0, $vr8, $vr8 +#ifdef DOUBLE + VFADD $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr0 + vpackod.w $vr0, $vr8, $vr8 + VFADD $vr8, $vr8, $vr0 +#endif /* defined DOUBLE */ +#endif /* defined DSDOT */ + .align 3 +.L15: +#ifdef DOUBLE + andi I, N, 0x1 +#else + andi I, N, 0x3 +#endif + bge $r0, I, .L999 /* =0 */ + .align 3 +.L16: + /* DOUBLE: 1 ; FLOAT: 1~3 */ + LD a1, X, 0 + LD b1, Y, 0 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + bnez I, .L16 + b .L999 + .align 3 + +.L20: +/* !((inc_x == 1) && (inc_y == 1)) */ + srai.d I, N, 3 +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 /* <8 */ + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + +EPILOGUE From a1562e4baec6ead6a4c4263fb6d22d5d35f7394e Mon Sep 17 00:00:00 2001 From: "Kirill A. Korinsky" Date: Tue, 28 Nov 2023 14:04:01 +0000 Subject: [PATCH 447/718] Allow weak linking on old macOS --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c6b96f41..7e5d1e0fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,13 +258,13 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 08fde5ebd206d4f808069668a4020d64852da34d Mon Sep 17 00:00:00 2001 From: "Kirill A. Korinsky" Date: Thu, 30 Nov 2023 21:24:58 +0000 Subject: [PATCH 448/718] Use 64bit build on `CMAKE_SYSTEM_PROCESSOR=i386` on Darwin Here a bit tricky things. A value `CMAKE_SYSTEM_PROCESSOR` is came from output of `uname -m` which migth be 32bit with 64bit building applicaiton. So, for that case use `CMAKE_SIZEOF_VOID_P` to detect the target. See https://trac.macports.org/ticket/68488 --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 49b9863e3..c9671b379 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") set(RISCV64 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*")) if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(X86_64 1) From 89fa51d495560b88bd2057aae4782e2ee58ce706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= Date: Sun, 3 Dec 2023 19:06:49 +0100 Subject: [PATCH 449/718] Revert 42b5e08 ("Allow weak linking on old macOS") --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7203088a0..6b02864ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,13 +258,13 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 2381132ada5e97a576f47abff7f97f6d2a26266a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= Date: Sun, 3 Dec 2023 19:13:53 +0100 Subject: [PATCH 450/718] Darwin < 20: always write xerbla.c.o into archive Write xerbla.c.o into archive regardless of timestamp by using ar -rs instead of ar -ru. --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b02864ca..f3eac2edf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,14 +256,14 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From e4586e81b896b85b600c50f9670e59989cbdabf7 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:02:18 +0000 Subject: [PATCH 451/718] [RISC-V] Add RISC-V Vector 128-bit target Current RVV x280 target depends on vlen=512-bits for Level 3 operations. Commit adds generic target that supports vlen=128-bits. New target uses the same scalable kernels as x280 for Level 1&2 operations, and autogenerated kernels for Level 3 operations. Functional correctness of Level 3 operations tested on vlen=128-bits using QEMU v8.1.1 for ctests and BLAS-Tester. --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + cpuid_riscv64.c | 4 +- getarch.c | 13 +- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 243 +++++ kernel/riscv64/cgemm_kernel_8x4_zvl128b.c | 996 +++++++++++++++++++ kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c | 1102 +++++++++++++++++++++ kernel/riscv64/dgemm_kernel_8x4_zvl128b.c | 492 +++++++++ kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c | 660 ++++++++++++ kernel/riscv64/sgemm_kernel_8x8_zvl128b.c | 791 +++++++++++++++ kernel/riscv64/strmm_kernel_8x8_zvl128b.c | 991 ++++++++++++++++++ kernel/riscv64/zgemm_kernel_4x4_zvl128b.c | 720 ++++++++++++++ kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c | 805 +++++++++++++++ param.h | 39 + 15 files changed, 6863 insertions(+), 2 deletions(-) create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL128B create mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c create mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/strmm_kernel_8x8_zvl128b.c create mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c create mode 100644 kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a2602..b56169da0 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL128B) +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a27141..93e270bde 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL128B) +CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc..5b7a63831 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -119,6 +119,7 @@ Z14 10.RISC-V 64: RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V x280 diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f21..15a539c20 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 #define CPU_RISCV64_ZVL256B 2 +#define CPU_RISCV64_ZVL128B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", - "CPU_RISCV64_ZVL256B" + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" }; int detect(void){ diff --git a/getarch.c b/getarch.c index 772836347..b8b7ef7e0 100644 --- a/getarch.c +++ b/getarch.c @@ -1691,7 +1691,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "x280" #else #endif - +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 000000000..fec69ee09 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..bd615389c --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..3268cb810 --- /dev/null +++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,1102 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='ctrmm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..a613f0bce --- /dev/null +++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,492 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dgemm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ad720e694 --- /dev/null +++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c @@ -0,0 +1,791 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='sgemm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); + c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); + c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); + c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + C[ci + 4 * ldc + 0] += alpha * result8; + C[ci + 4 * ldc + 1] += alpha * result9; + C[ci + 5 * ldc + 0] += alpha * result10; + C[ci + 5 * ldc + 1] += alpha * result11; + C[ci + 6 * ldc + 0] += alpha * result12; + C[ci + 6 * ldc + 1] += alpha * result13; + C[ci + 7 * ldc + 0] += alpha * result14; + C[ci + 7 * ldc + 1] += alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + C[ci + 4 * ldc + 0] += alpha * result4; + C[ci + 5 * ldc + 0] += alpha * result5; + C[ci + 6 * ldc + 0] += alpha * result6; + C[ci + 7 * ldc + 0] += alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); + c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + C[ci + 2 * ldc + 0] += alpha * result4; + C[ci + 2 * ldc + 1] += alpha * result5; + C[ci + 3 * ldc + 0] += alpha * result6; + C[ci + 3 * ldc + 1] += alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + C[ci + 2 * ldc + 0] += alpha * result2; + C[ci + 3 * ldc + 0] += alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + ci += ldc - gvl * 0; + vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + C[ci + 1 * ldc + 0] += alpha * result2; + C[ci + 1 * ldc + 1] += alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 1 * ldc + 0] += alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); + c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + C[ci + 0 * ldc + 1] += alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + + for (BLASLONG k = 0; k < K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] += alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c new file mode 100644 index 000000000..ef18f036c --- /dev/null +++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c @@ -0,0 +1,991 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=8 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='strmm_kernel_8x8_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 8; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 8; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + float B4 = B[bi + 4]; + float B5 = B[bi + 5]; + float B6 = B[bi + 6]; + float B7 = B[bi + 7]; + bi += 8; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); + vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); + vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); + vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + B4 = B[bi + 4]; + B5 = B[bi + 5]; + B6 = B[bi + 6]; + B7 = B[bi + 7]; + bi += 8; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); + result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); + result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); + result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); + vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); + vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); + vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c4, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c5, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c6, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c7, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + result8 += A[ai + 0] * B[bi + 4]; + result9 += A[ai + 1] * B[bi + 4]; + result10 += A[ai + 0] * B[bi + 5]; + result11 += A[ai + 1] * B[bi + 5]; + result12 += A[ai + 0] * B[bi + 6]; + result13 += A[ai + 1] * B[bi + 6]; + result14 += A[ai + 0] * B[bi + 7]; + result15 += A[ai + 1] * B[bi + 7]; + ai += 2; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + C[ci + 4 * ldc + 0] = alpha * result8; + C[ci + 4 * ldc + 1] = alpha * result9; + C[ci + 5 * ldc + 0] = alpha * result10; + C[ci + 5 * ldc + 1] = alpha * result11; + C[ci + 6 * ldc + 0] = alpha * result12; + C[ci + 6 * ldc + 1] = alpha * result13; + C[ci + 7 * ldc + 0] = alpha * result14; + C[ci + 7 * ldc + 1] = alpha * result15; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 8; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 8; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + result4 += A[ai + 0] * B[bi + 4]; + result5 += A[ai + 0] * B[bi + 5]; + result6 += A[ai + 0] * B[bi + 6]; + result7 += A[ai + 0] * B[bi + 7]; + ai += 1; + bi += 8; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + C[ci + 4 * ldc + 0] = alpha * result4; + C[ci + 5 * ldc + 0] = alpha * result5; + C[ci + 6 * ldc + 0] = alpha * result6; + C[ci + 7 * ldc + 0] = alpha * result7; + m_top += 1; + } + + n_top += 8; + } + + // -- tails for N=4 + + if (N & 4) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + float B2 = B[bi + 2]; + float B3 = B[bi + 3]; + bi += 4; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); + vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); + vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + float B0 = B[bi + 0]; + float B1 = B[bi + 1]; + bi += 2; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse32_v_f32m2(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + float B0 = B[bi + 0]; + bi += 1; + + vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); + __riscv_vse32_v_f32m2(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/param.h b/param.h index c5c70b78e..a1a70400c 100644 --- a/param.h +++ b/param.h @@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef RISCV64_ZVL256B #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 From 4a12cf53ec116c06e5d74073b54a3bca6046cb17 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Mon, 4 Dec 2023 11:13:35 +0000 Subject: [PATCH 452/718] [RISC-V] Improve RVV kernel generator LMUL usage The RVV kernel generation script uses the provided LMUL to increase the number of accumulator registers. Since the effect of the LMUL is to group together the vector registers into larger ones, it actually should be used as a multiplier in the calculation of vlenmax. At the moment, no matter what LMUL is provided, the generated kernels would only set the maximum number of vector elements equal to VLEN/SEW. Commit changes the use of LMUL to properly adjust vlenmax. Note that an increase in LMUL results in a decrease in the number of effective vector registers. --- kernel/riscv64/generate_kernel.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py index e2ce97971..8be7c9f9c 100755 --- a/kernel/riscv64/generate_kernel.py +++ b/kernel/riscv64/generate_kernel.py @@ -197,13 +197,13 @@ def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): dest.write("ai += {M}*2;") dest.write() - - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 ) pass_regs = (accumulation_regs + a_regs)*2 - tmp_regs = 32-pass_regs + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs if tmp_regs < 2: raise RuntimeError("Complex kernel would use too many registers!") @@ -337,10 +337,12 @@ def generate_gemm_kernel( settings, OUTPUT ): M = settings['M'].value N = settings['N'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) a_regs = max(int(M/vlenmax), 1) - accumulation_regs = a_regs * N * settings['LMUL_ACC'].value + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N required_regs = accumulation_regs + a_regs if is_complex: required_regs = required_regs * 2 + 2 @@ -380,9 +382,9 @@ def generate_gemm_kernel( settings, OUTPUT ): '''.format(tail_policy=settings['tail_policy'].value)) - if required_regs > 32: - raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format( - required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '') + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value )) TRMM = (settings['op'].value == 'trmm') @@ -448,7 +450,8 @@ def generate_gemm_kernel( settings, OUTPUT ): def generate_M_tails( dest, settings, M, N ): M_tail = int(M/2) M_tail_min = settings['M_tail_scalar_from'].value - vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value ) + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value + / settings['ELEN_PARAM'].value ) TRMM = (settings['op'].value == 'trmm') is_complex = settings['complex'].value generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real @@ -667,4 +670,4 @@ def main(): ERROR("unsupported kernel type {}".format(settings['op'])) if __name__ == "__main__": - main() \ No newline at end of file + main() From a53a79e059545e2935e2745f41cf3498277f1a1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 15:41:39 +0100 Subject: [PATCH 453/718] Add tests for the DMD functions (Reference-LAPACK PR 736) --- lapack-netlib/TESTING/EIG/Makefile | 58 +- lapack-netlib/TESTING/EIG/cchkdmd.f90 | 721 +++++++++++++++++++++++ lapack-netlib/TESTING/EIG/dchkdmd.f90 | 813 ++++++++++++++++++++++++++ lapack-netlib/TESTING/EIG/schkdmd.f90 | 792 +++++++++++++++++++++++++ lapack-netlib/TESTING/EIG/zchkdmd.f90 | 745 +++++++++++++++++++++++ 5 files changed, 3116 insertions(+), 13 deletions(-) create mode 100644 lapack-netlib/TESTING/EIG/cchkdmd.f90 create mode 100644 lapack-netlib/TESTING/EIG/dchkdmd.f90 create mode 100644 lapack-netlib/TESTING/EIG/schkdmd.f90 create mode 100644 lapack-netlib/TESTING/EIG/zchkdmd.f90 diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 942ae6982..5de315b6e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -64,6 +64,8 @@ SEIGTST = schkee.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ sstt22.o ssyl01.o ssyt21.o ssyt22.o +SDMDEIGTST = schkdmd.o + CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \ @@ -81,6 +83,8 @@ CEIGTST = cchkee.o \ csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o +CDMDEIGTST = cchkdmd.o + DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ dsvdch.o dsvdct.o dsxt1.o @@ -101,6 +105,8 @@ DEIGTST = dchkee.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ dstt22.o dsyl01.o dsyt21.o dsyt22.o +DDMDEIGTST = dchkdmd.o + ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \ @@ -118,27 +124,45 @@ ZEIGTST = zchkee.o \ zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o +ZDMDEIGTST = zchkdmd.o + .PHONY: all all: single complex double complex16 .PHONY: single complex double complex16 -single: xeigtsts -complex: xeigtstc -double: xeigtstd -complex16: xeigtstz +single: xeigtsts xdmdeigtsts +complex: xeigtstc xdmdeigtstc +double: xeigtstd xdmdeigtstd +complex16: xeigtstz xdmdeigtstz + +xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + +$(SDMDEIGTST): $(FRC) +$(CDMDEIGTST): $(FRC) +$(DDMDEIGTST): $(FRC) +$(ZDMDEIGTST): $(FRC) $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) $(DZIGTST): $(FRC) @@ -155,7 +179,7 @@ clean: cleanobj cleanexe cleanobj: rm -f *.o cleanexe: - rm -f xeigtst* + rm -f xeigtst* xdmdeigtst* schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< @@ -165,3 +189,11 @@ cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< +schkdmd.o: schkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +cchkdmd.o: cchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +dchkdmd.o: dchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< +zchkdmd.o: zchkdmd.f90 + $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90 new file mode 100644 index 000000000..a9c181da9 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90 @@ -0,0 +1,721 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! CGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! CGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + + use iso_fortran_env + IMPLICIT NONE + integer, parameter :: WP = real32 +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: CONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_XW, & + TMP_EX +!............................................................ + COMPLEX(KIND=WP) :: CMAX + INTEGER :: LCWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: A, AC, & + AU, F, F0, F1, S, W, & + X, X0, Y, Y0, Y1, Z, Z1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: CDA, CDR, & + CDL, CEIGS, CEIGSA, CWORK + COMPLEX(KIND=WP) :: CDUMMY(22), CDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER :: iNRNK, iWHTSVD, K_traj, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!..... external subroutines (BLAS and LAPACK) + EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL +!.....external subroutines DMD package +! subroutines under test + EXTERNAL CGEDMD, CGEDMDQ +!..... external functions (BLAS and LAPACK) + EXTERNAL SCNRM2, SLAMCH + REAL(KIND=WP) :: SCNRM2, SLAMCH + EXTERNAL CLANGE + REAL(KIND=WP) :: CLANGE + EXTERNAL ICAMAX + INTEGER ICAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + + WRITE(*,*) 'COMPLEX CODE TESTING' + + ! The test is always in pairs : ( CGEDMD and CGEDMDQ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + ! This code by default performs tests on CGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision WP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + READ(*,*) M + WRITE(*,*) 'M = ', M + ! ... and the number of snapshots. + READ(*,*) N + WRITE(*,*) 'N = ', N + + ! Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_XW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F0(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(LDY,N+1) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( S(LDS,N) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( CEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = 10*M*EPS + TOL2 = 10*M*N*EPS + +!............. + + DO K_traj = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + CMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + ! Loop over all parameter MODE values for CLATMR (+-1,..,+-6) + + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( CDA(M) ) + ALLOCATE( CDL(M) ) + ALLOCATE( CDR(M) ) + + CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, & + CMAX, RSIGN, GRADE, CDL, MODEL, CONDL, & + CDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE( CDR ) + DEALLOCATE( CDL ) + DEALLOCATE( CDA ) + DEALLOCATE( IWORK ) + + LCWORK = MAX(1,2*M) + ALLOCATE( CEIGSA(M) ) + ALLOCATE( CWORK(LCWORK) ) + ALLOCATE( WORK(2*M) ) + AC(1:M,1:M) = A(1:M,1:M) + CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, & + CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(CWORK) + + TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A + ! Scale the matrix A to have unit spectral radius. + CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, & + A, LDA, INFO ) + CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, & + CEIGSA, M, INFO ) + ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY ) + + IF ( K_traj == 2 ) THEN + ! generate data as two trajectories + ! with two inital conditions + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F(1:M,1:N/2) + Y0(1:M,1:N/2) = F(1:M,2:N/2+1) + + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N-N/2 + CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1) + ELSE + CALL CLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1, & + CZERO, F(1,i+1), 1 ) + END DO + F0(1:M,1:N+1) = F(1:M,1:N+1) + X0(1:M,1:N) = F0(1:M,1:N) + Y0(1:M,1:N) = F0(1:M,2:N+1) + END IF + + DEALLOCATE( CEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! CGEDMD is always tested and its results are also used for + ! comparisons with CGEDMDQ. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO ) + + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to CGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + ELSE + !WRITE(*,*) '... done. Workspace length computed.' + END IF + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, CEIGS, Z, LDZ, RES, & + AU, LDAU, W, LDW, S, LDS, & + CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVX(1:N) = WORK(1:N) + + !...... CGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from CGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the Rayleigh quotient + ! returned in W + CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_XW = MAX(TMP_XW, TMP ) + IF ( TMP_XW <= TOL ) THEN + !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + !...... CGEDMD check point + + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, & + CZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL2 + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY ) + + DO i=1, K + CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1) + END DO + END IF + !...... CGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by CGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in CGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................CGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + +!....................................................................................................... + + IF ( K_traj == 1 ) THEN + + F(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, K, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LCWORK = INT(CDUMMY(LWMINOPT)) + ALLOCATE(CWORK(LCWORK)) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, F, LDF, X, LDX, Y, LDY, & + NRNK, TOL, KQ, CEIGS, Z, LDZ, RES, AU, & + LDAU, W, LDW, S, LDS, CWORK, LCWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to CGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) =WORK(1:N) + + !..... ZGEDMDQ check point + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + !..... CGEDMDQ check point + + !..... CGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F1(1:M,1:N+1) = F0(1:M,1:N+1) + CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, & + LDF, Y, LDY, CONE, F1, LDF ) + TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / & + CLANGE( 'F', M, N+1, F0, LDF, WORK ) + IF ( TMP_FQR <= TOL2 ) THEN + !WRITE(*,*) ':) CGEDMDQ ........ PASSED.' + ELSE + WRITE(*,*) ':( CGEDMDQ ........ FAILED.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + !..... ZGEDMDQ checkpoint + !..... ZGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + DO i = 1, KQ + ! have a real eigenvalue with real eigenvector + CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = SCNRM2( M, Y1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ CGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + END IF + END IF + + DEALLOCATE(CWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + END IF + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( A ) + DEALLOCATE( AC ) + DEALLOCATE( Z ) + DEALLOCATE( F ) + DEALLOCATE( F0 ) + DEALLOCATE( F1 ) + DEALLOCATE( X ) + DEALLOCATE( X0 ) + DEALLOCATE( Y ) + DEALLOCATE( Y0 ) + DEALLOCATE( Y1 ) + DEALLOCATE( AU ) + DEALLOCATE( W ) + DEALLOCATE( S ) + DEALLOCATE( Z1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( CEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_z_XV + END IF + + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for CGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90 new file mode 100644 index 000000000..4fbf7531b --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90 @@ -0,0 +1,813 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! DGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! DGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!... ......................................................... +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and use it +! localy instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear whith Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real worksapce. We use max value over +! two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL DLARNV, DLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL DGEDMD, DGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DLANGE, DNRM2 + REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( DGEDMD and DGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = DLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL DLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL DLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + CALL DLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! DGEDMD: Workspace query and workspace allocation + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! DGEDMD test: CALL DGEDMD + CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... DGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from DGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + END IF + + !...... DGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... DGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................DGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + !..... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the DGEDMDQ + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! DGEDMDQ test: Workspace query and workspace allocation + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + ! DGEDMDQ test: CALL DGEDMDQ + CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... DGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + END IF + + !..... DGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + DLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... DGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by DGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in DGEDMDQ) + i = 1 + DO WHILE ( i <= KQ ) + IF ( IEIGQ(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIGQ(i) + AB(2,1) = -IEIGQ(i) + AB(1,2) = IEIGQ(i) + AB(2,2) = REIGQ(i) + CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES1(i) = DLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) ! LAPACK CALL + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(K)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP > TOL2 ) THEN + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ DGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + STOP + END IF + + END IF + + DO i = 1, KQ + LAMBDAQ(i,1) = REIGQ(i) + LAMBDAQ(i,2) = IEIGQ(i) + END DO + + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + END IF ! TEST_QRDMD +!====================================================================== + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! WHTSVD LOOP + !write(*,*) 'WHTSVD loop completed' + END DO ! NRNK LOOP + !write(*,*) 'NRNK loop completed' + END DO ! SCALE LOOP + !write(*,*) 'SCALE loop completed' + END DO ! JOBF LOOP + !write(*,*) 'JOBREF loop completed' + END DO ! JOBZ LOOP + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE(A) + DEALLOCATE(AC) + DEALLOCATE(DA) + DEALLOCATE(DL) + DEALLOCATE(F) + DEALLOCATE(F1) + DEALLOCATE(F2) + DEALLOCATE(X) + DEALLOCATE(X0) + DEALLOCATE(SINGVX) + DEALLOCATE(SINGVQX) + DEALLOCATE(Y) + DEALLOCATE(Y0) + DEALLOCATE(Y1) + DEALLOCATE(Z) + DEALLOCATE(Z1) + DEALLOCATE(RES) + DEALLOCATE(RES1) + DEALLOCATE(RESEX) + DEALLOCATE(REIG) + DEALLOCATE(IEIG) + DEALLOCATE(REIGQ) + DEALLOCATE(IEIGQ) + DEALLOCATE(REIGA) + DEALLOCATE(IEIGA) + DEALLOCATE(VA) + DEALLOCATE(LAMBDA) + DEALLOCATE(LAMBDAQ) + DEALLOCATE(EIGA) + DEALLOCATE(W) + DEALLOCATE(AU) + DEALLOCATE(S) + +!............................................................ + ! Generate random M-by-M matrix A. Use DLATMR from + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for DGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for DGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90 new file mode 100644 index 000000000..77e3e46c0 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/schkdmd.f90 @@ -0,0 +1,792 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! SGEDMD for computation of the +! Dynamic Mode Decomposition (DMD) +! SGEDMDQ for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!... ......................................................... +! NOTE: +! When using the Intel MKL 2022.0.3 the subroutine xGESVDQ +! (optionally used in xGEDMD) may cause access violation +! error for x = S, D, C, Z, but only if called with the +! work space query. (At least in our Windows 10 MSVS 2019.) +! The problem can be mitigated by downloading the source +! code of xGESVDQ from the LAPACK repository and use it +! localy instead of the one in the MKL. This seems to +! indicate that the problem is indeed in the MKL. +! This problem did not appear whith Intel MKL 2022.2.0. +! +! NOTE: +! xGESDD seems to have a problem with workspace. In some +! cases the length of the optimal workspace is returned +! smaller than the minimal workspace, as specified in the +! code. As a precaution, all optimal workspaces are +! set as MAX(minimal, optimal). +! Latest implementations of complex xGESDD have different +! length of the real worksapce. We use max value over +! two versions. +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real32 + IMPLICIT NONE + integer, parameter :: WP = real32 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: & + A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,& + Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1 + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: & + DA, DL, DR, REIG, REIGA, REIGQ, IEIG, & + IEIGA, IEIGQ, RES, RES1, RESEX, SINGVX,& + SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: AB(2,2), WDUMMY(2) + INTEGER :: IDUMMY(2), ISEED(4), RJOBDATA(8) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX, XNORM, YNORM +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD + INTEGER iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + + LOGICAL TEST_QRDMD +!..... external subroutines (BLAS and LAPACK) + EXTERNAL SAXPY, SGEEV, SGEMM, SGEMV, SLACPY, SLASCL + EXTERNAL SLARNV, SLATMR +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL SGEDMD, SGEDMDQ + +!..... external functions (BLAS and LAPACK) + EXTERNAL SLAMCH, SLANGE, SNRM2 + REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2 + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX +!............................................................ + + ! The test is always in pairs : ( SGEDMD and SGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = SLAMCH( 'P' ) ! machine precision SP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + KDIFF = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = MAX(M,N+1) + LDY = MAX(M,N+1) + LDW = N + LDZ = M + LDAU = MAX(M,N+1) + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ! + ! Test the subroutines on real data snapshots. All + ! computation is done in real arithmetic, even when + ! Koopman eigenvalues and modes are real. + ! + ! Allocate memory space + ALLOCATE( A(LDA,M) ) + ALLOCATE( AC(LDA,M) ) + ALLOCATE( DA(M) ) + ALLOCATE( DL(M) ) + ALLOCATE( F(LDF,N+1) ) + ALLOCATE( F1(LDF,N+1) ) + ALLOCATE( F2(LDF,N+1) ) + ALLOCATE( X(LDX,N) ) + ALLOCATE( X0(LDX,N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + ALLOCATE( Y(LDY,N+1) ) + ALLOCATE( Y0(LDY,N+1) ) + ALLOCATE( Y1(M,N+1) ) + ALLOCATE( Z(LDZ,N) ) + ALLOCATE( Z1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( REIG(N) ) + ALLOCATE( IEIG(N) ) + ALLOCATE( REIGQ(N) ) + ALLOCATE( IEIGQ(N) ) + ALLOCATE( REIGA(M) ) + ALLOCATE( IEIGA(M) ) + ALLOCATE( VA(LDA,M) ) + ALLOCATE( LAMBDA(N,2) ) + ALLOCATE( LAMBDAQ(N,2) ) + ALLOCATE( EIGA(M,2) ) + ALLOCATE( W(LDW,N) ) + ALLOCATE( AU(LDAU,N) ) + ALLOCATE( S(N,N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D8 + DMAX = 1.0D2 + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D2 + MODER = 6 + CONDR = 1.0D2 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE(DR(N)) + CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, & + DMAX, RSIGN, GRADE, DL, MODEL, CONDL, & + DR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO ) + DEALLOCATE(IWORK) + DEALLOCATE(DR) + + LWORK = 4*M+1 + ALLOCATE(WORK(LWORK)) + AC = A + CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, & + VA, M, WORK, LWORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + TMP = ZERO + DO i = 1, M + EIGA(i,1) = REIGA(i) + EIGA(i,2) = IEIGA(i) + TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2)) + END DO + + ! Scale A to have the desirable spectral radius. + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO ) + CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO ) + + ! Compute the norm of A + ANORM = SLANGE( 'F', N, N, A, M, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data with two inital conditions + CALL SLARNV(2, ISEED, M, F1(1,1) ) + F1(1:M,1) = 1.0E-10*F1(1:M,1) + DO i = 1, N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,1:N/2) = F1(1:M,1:N/2) + Y0(1:M,1:N/2) = F1(1:M,2:N/2+1) + + CALL SLARNV(2, ISEED, M, F1(1,1) ) + DO i = 1, N-N/2 + CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, & + F1(1,i+1), 1 ) + END DO + X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2) + Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1) + ELSE + ! single trajectory + CALL SLARNV(2, ISEED, M, F(1,1) ) + DO i = 1, N + CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, & + F(1,i+1), 1 ) + END DO + X0(1:M,1:N) = F(1:M,1:N) + Y0(1:M,1:N) = F(1:M,2:N+1) + END IF + + XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY ) + YNORM = SLANGE( 'F', M, N, Y0, LDX, WDUMMY ) +!............................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' ! Ritz vectors will be computed + RESIDS = 'R' ! Residuals will be computed + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' ! Ritz vectors in factored form + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' ! Data for refined Ritz vectors + CASE(2) + JOBREF = 'E' ! Exact DMD vectors + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' ! X data normalized + CASE(2) + SCALE = 'C' ! X normalized, consist. check + CASE(3) + SCALE = 'Y' ! Y data normalized + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + ! Two truncation strategies. The "-2" case for R&D + ! purposes only - it uses possibly low accuracy small + ! singular values, in which case the formulas used in + ! the DMD are highly sensitive. + NRNK = iNRNK + + DO iWHTSVD = 1, 4 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + X(1:M,1:N) = X0(1:M,1:N) + Y(1:M,1:N) = Y0(1:M,1:N) + + ! SGEDMD: Workspace query and workspace allocation + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, & + IDUMMY, -1, INFO ) + + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE( WORK(LWORK) ) + + ! SGEDMD test: CALL SGEDMD + CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, & + N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, & + LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,& + IWORK, LIWORK, INFO ) + + SINGVX(1:N) = WORK(1:N) + + !...... SGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from SGEDMD + ! This checks that the returned aigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1) + TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + + IF ( TMP_ZXW > 10*M*EPS ) THEN + NFAIL_Z_XV = NFAIL_Z_XV + 1 + END IF + + END IF + + !...... SGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, & + ZERO, Z1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1) + TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_AU = NFAIL_AU + 1 + END IF + + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M ) + i=1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 ) + RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RESEX(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK )/ SLANGE( 'F', M, 2, AU(1,i), M, & + WORK ) + RESEX(i+1) = RESEX(i) + i = i + 2 + END IF + END DO + + END IF + + !...... SGEDMD check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMD,) + i = 1 + DO WHILE ( i <= K ) + IF ( IEIG(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 ) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIG(i) + AB(2,1) = -IEIG(i) + AB(1,2) = IEIG(i) + AB(2,2) = REIG(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + + IF ( TMP > TOL2 ) THEN + NFAIL_REZ = NFAIL_REZ + 1 + END IF + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + ! ... store the results for inspection + DO i = 1, K + LAMBDA(i,1) = REIG(i) + LAMBDA(i,2) = IEIG(i) + END DO + + DEALLOCATE(IWORK) + DEALLOCATE(WORK) + + !====================================================================== + ! Now test the SGEDMDQ, if requested. + !====================================================================== + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + RJOBDATA(2) = 1 + F1 = F + + ! SGEDMDQ test: Workspace query and workspace allocation + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, & + -1, IDUMMY, -1, INFO ) + LIWORK = IDUMMY(1) + ALLOCATE( IWORK(LIWORK) ) + LWORK = INT(WDUMMY(LWMINOPT)) + ALLOCATE(WORK(LWORK)) + + ! SGEDMDQ test: CALL SGEDMDQ + CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, & + JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, & + LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ, & + RES, AU, LDAU, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ) + + !..... SGEDMDQ check point + IF ( KQ /= K ) THEN + KDIFF = KDIFF+1 + END IF + + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + END IF + + !..... SGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + F2 = F + CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, & + LDF, Y, LDY, ONE, F2, LDF ) + TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / & + SLANGE( 'F', M, N+1, F, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + NFAIL_F_QR = NFAIL_F_QR + 1 + END IF + END IF + + !..... SGEDMDQ checkpoint + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by SGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in SGEDMDQ) + i = 1 + DO WHILE ( i <= KQ ) + IF ( IEIGQ(i) == ZERO ) THEN + ! have a real eigenvalue with real eigenvector + CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = SNRM2( M, Y1(1,i), 1) + i = i + 1 + ELSE + ! Have a complex conjugate pair + ! REIG(i) +- sqrt(-1)*IMEIG(i). + ! Since all computation is done in real + ! arithmetic, the formula for the residual + ! is recast for real representation of the + ! complex conjugate eigenpair. See the + ! description of RES. + AB(1,1) = REIGQ(i) + AB(2,1) = -IEIGQ(i) + AB(1,2) = IEIGQ(i) + AB(2,2) = REIGQ(i) + CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), & + M, AB, 2, ONE, Y1(1,i), M ) ! BLAS CALL + ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC + RES1(i) = SLANGE( 'F', M, 2, Y1(1,i), M, & + WORK ) ! LAPACK CALL + RES1(i+1) = RES1(i) + i = i + 2 + END IF + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(K)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP > TOL2 ) THEN + NFAIL_REZQ = NFAIL_REZQ + 1 + END IF + + END IF + + DO i = 1, KQ + LAMBDAQ(i,1) = REIGQ(i) + LAMBDAQ(i,2) = IEIGQ(i) + END DO + + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + END IF ! TEST_QRDMD +!====================================================================== + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! WHTSVD LOOP + !write(*,*) 'WHTSVD loop completed' + END DO ! NRNK LOOP + !write(*,*) 'NRNK loop completed' + END DO ! SCALE LOOP + !write(*,*) 'SCALE loop completed' + END DO ! JOBF LOOP + !write(*,*) 'JOBREF loop completed' + END DO ! JOBZ LOOP + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE(A) + DEALLOCATE(AC) + DEALLOCATE(DA) + DEALLOCATE(DL) + DEALLOCATE(F) + DEALLOCATE(F1) + DEALLOCATE(F2) + DEALLOCATE(X) + DEALLOCATE(X0) + DEALLOCATE(SINGVX) + DEALLOCATE(SINGVQX) + DEALLOCATE(Y) + DEALLOCATE(Y0) + DEALLOCATE(Y1) + DEALLOCATE(Z) + DEALLOCATE(Z1) + DEALLOCATE(RES) + DEALLOCATE(RES1) + DEALLOCATE(RESEX) + DEALLOCATE(REIG) + DEALLOCATE(IEIG) + DEALLOCATE(REIGQ) + DEALLOCATE(IEIGQ) + DEALLOCATE(REIGA) + DEALLOCATE(IEIGA) + DEALLOCATE(VA) + DEALLOCATE(LAMBDA) + DEALLOCATE(LAMBDAQ) + DEALLOCATE(EIGA) + DEALLOCATE(W) + DEALLOCATE(AU) + DEALLOCATE(S) + +!............................................................ + ! Generate random M-by-M matrix A. Use DLATMR from + END DO ! LLOOP + + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for SGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for SGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90 new file mode 100644 index 000000000..873d956c4 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90 @@ -0,0 +1,745 @@ +! This is a test program for checking the implementations of +! the implementations of the following subroutines +! +! ZGEDMD, for computation of the +! Dynamic Mode Decomposition (DMD) +! ZGEDMDQ, for computation of a +! QR factorization based compressed DMD +! +! Developed and supported by: +! =========================== +! Developed and coded by Zlatko Drmac, Faculty of Science, +! University of Zagreb; drmac@math.hr +! In cooperation with +! AIMdyn Inc., Santa Barbara, CA. +! ======================================================== +! How to run the code (compiler, link info) +! ======================================================== +! Compile as FORTRAN 90 (or later) and link with BLAS and +! LAPACK libraries. +! NOTE: The code is developed and tested on top of the +! Intel MKL library (versions 2022.0.3 and 2022.2.0), +! using the Intel Fortran compiler. +! +! For developers of the C++ implementation +! ======================================================== +! See the LAPACK++ and Template Numerical Toolkit (TNT) +! +! Note on a development of the GPU HP implementation +! ======================================================== +! Work in progress. See CUDA, MAGMA, SLATE. +! NOTE: The four SVD subroutines used in this code are +! included as a part of R&D and for the completeness. +! This was also an opportunity to test those SVD codes. +! If the scaling option is used all four are essentially +! equally good. For implementations on HP platforms, +! one can use whichever SVD is available. +!............................................................ + +!............................................................ +!............................................................ +! + PROGRAM DMD_TEST + use iso_fortran_env, only: real64 + IMPLICIT NONE + integer, parameter :: WP = real64 + +!............................................................ + REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP + REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP + + COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) + COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) +!............................................................ + REAL(KIND=WP), ALLOCATABLE, DIMENSION(:) :: RES, & + RES1, RESEX, SINGVX, SINGVQX, WORK + INTEGER , ALLOCATABLE, DIMENSION(:) :: IWORK + REAL(KIND=WP) :: WDUMMY(2) + INTEGER :: IDUMMY(4), ISEED(4) + REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS, & + TOL, TOL2, SVDIFF, TMP, TMP_AU, & + TMP_FQR, TMP_REZ, TMP_REZQ, TMP_ZXW, & + TMP_EX + +!............................................................ + COMPLEX(KIND=WP) :: ZMAX + INTEGER :: LZWORK + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) :: ZA, ZAC, & + ZAU, ZF, ZF0, ZF1, ZS, ZW, & + ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1 + COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:) :: ZDA, ZDR, & + ZDL, ZEIGS, ZEIGSA, ZWORK + COMPLEX(KIND=WP) :: ZDUMMY(22), ZDUM2X2(2,2) +!............................................................ + INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY, & + LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp + INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, & + NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ, & + NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, & + NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD, & + WHTSVDsp + INTEGER :: iNRNK, iWHTSVD, K_TRAJ, LWMINOPT + CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN, & + SCALE, RESIDS, WANTQ, WANTR + LOGICAL :: TEST_QRDMD + +!.....external subroutines (BLAS and LAPACK) + EXTERNAL DAXPY, DGEEV, DGEMM, DGEMV, DLACPY, DLASCL + EXTERNAL ZGEEV, ZGEMV, ZLASCL + EXTERNAL ZLARNV, ZLATMR + EXTERNAL ZAXPY, ZGEMM +!.....external subroutines DMD package, part 1 +! subroutines under test + EXTERNAL ZGEDMD, ZGEDMDQ +!.....external functions (BLAS and LAPACK) + EXTERNAL DLAMCH, DZNRM2 + REAL(KIND=WP) :: DLAMCH, DZNRM2 + REAL(KIND=WP) :: ZLANGE + EXTERNAL IZAMAX + INTEGER IZAMAX + EXTERNAL LSAME + LOGICAL LSAME + + INTRINSIC ABS, INT, MIN, MAX, SIGN +!............................................................ + + ! The test is always in pairs : ( ZGEDMD and ZGEDMDQ ) + ! because the test includes comparing the results (in pairs). +!..................................................................................... + TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ + ! Since the QR factorizations based algorithm is designed for + ! single trajectory data, only single trajectory tests will + ! be performed with xGEDMDQ. + WANTQ = 'Q' + WANTR = 'R' +!................................................................................. + + EPS = DLAMCH( 'P' ) ! machine precision DP + + ! Global counters of failures of some particular tests + NFAIL = 0 + NFAIL_REZ = 0 + NFAIL_REZQ = 0 + NFAIL_Z_XV = 0 + NFAIL_F_QR = 0 + NFAIL_AU = 0 + NFAIL_SVDIFF = 0 + NFAIL_TOTAL = 0 + NFAILQ_TOTAL = 0 + + DO LLOOP = 1, 4 + + WRITE(*,*) 'L Loop Index = ', LLOOP + + ! Set the dimensions of the problem ... + WRITE(*,*) 'M = ' + READ(*,*) M + WRITE(*,*) M + ! ... and the number of snapshots. + WRITE(*,*) 'N = ' + READ(*,*) N + WRITE(*,*) N + + ! ... Test the dimensions + IF ( ( MIN(M,N) == 0 ) .OR. ( M < N ) ) THEN + WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.' + STOP + END IF +!............. + ! The seed inside the LLOOP so that each pass can be reproduced easily. + ISEED(1) = 4 + ISEED(2) = 3 + ISEED(3) = 2 + ISEED(4) = 1 + + LDA = M + LDF = M + LDX = M + LDY = M + LDW = N + LDZ = M + LDAU = M + LDS = N + + TMP_ZXW = ZERO + TMP_AU = ZERO + TMP_REZ = ZERO + TMP_REZQ = ZERO + SVDIFF = ZERO + TMP_EX = ZERO + + ALLOCATE( ZA(LDA,M) ) + ALLOCATE( ZAC(LDA,M) ) + ALLOCATE( ZF(LDF,N+1) ) + ALLOCATE( ZF0(LDF,N+1) ) + ALLOCATE( ZF1(LDF,N+1) ) + ALLOCATE( ZX(LDX,N) ) + ALLOCATE( ZX0(LDX,N) ) + ALLOCATE( ZY(LDY,N+1) ) + ALLOCATE( ZY0(LDY,N+1) ) + ALLOCATE( ZY1(LDY,N+1) ) + ALLOCATE( ZAU(LDAU,N) ) + ALLOCATE( ZW(LDW,N) ) + ALLOCATE( ZS(LDS,N) ) + ALLOCATE( ZZ(LDZ,N) ) + ALLOCATE( ZZ1(LDZ,N) ) + ALLOCATE( RES(N) ) + ALLOCATE( RES1(N) ) + ALLOCATE( RESEX(N) ) + ALLOCATE( ZEIGS(N) ) + ALLOCATE( SINGVX(N) ) + ALLOCATE( SINGVQX(N) ) + + TOL = M*EPS + ! This mimics O(M*N)*EPS bound for accumulated roundoff error. + ! The factor 10 is somewhat arbitrary. + TOL2 = 10*M*N*EPS + +!............. + + DO K_TRAJ = 1, 2 + ! Number of intial conditions in the simulation/trajectories (1 or 2) + + COND = 1.0D4 + ZMAX = (1.0D1,1.0D1) + RSIGN = 'F' + GRADE = 'N' + MODEL = 6 + CONDL = 1.0D1 + MODER = 6 + CONDR = 1.0D1 + PIVTNG = 'N' + + ! Loop over all parameter MODE values for ZLATMR (+1,..,+6) + DO MODE = 1, 6 + + ALLOCATE( IWORK(2*M) ) + ALLOCATE( ZDA(M) ) + ALLOCATE( ZDL(M) ) + ALLOCATE( ZDR(M) ) + + CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, & + ZMAX, RSIGN, GRADE, ZDL, MODEL, CONDL, & + ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, & + ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO ) + DEALLOCATE( ZDR ) + DEALLOCATE( ZDL ) + DEALLOCATE( ZDA ) + DEALLOCATE( IWORK ) + + LZWORK = MAX(1,2*M) + ALLOCATE( ZEIGSA(M) ) + ALLOCATE( ZWORK(LZWORK) ) + ALLOCATE( WORK(2*M) ) + ZAC(1:M,1:M) = ZA(1:M,1:M) + CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, & + ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL + DEALLOCATE(WORK) + DEALLOCATE(ZWORK) + + TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA + ! Scale the matrix ZA to have unit spectral radius. + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, & + ZA, LDA, INFO ) + CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, & + ZEIGSA, M, INFO ) + ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY ) + + IF ( K_TRAJ == 2 ) THEN + ! generate data as two trajectories + ! with two inital conditions + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,1:N/2) = ZF(1:M,1:N/2) + ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1) + + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N-N/2 + CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2) + ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1) + ELSE + CALL ZLARNV(2, ISEED, M, ZF(1,1) ) + DO i = 1, N + CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1, & + ZZERO, ZF(1,i+1), 1 ) + END DO + ZF0(1:M,1:N+1) = ZF(1:M,1:N+1) + ZX0(1:M,1:N) = ZF0(1:M,1:N) + ZY0(1:M,1:N) = ZF0(1:M,2:N+1) + END IF + + DEALLOCATE( ZEIGSA ) +!........................................................................ + + DO iJOBZ = 1, 4 + + SELECT CASE ( iJOBZ ) + CASE(1) + JOBZ = 'V' + RESIDS = 'R' + CASE(2) + JOBZ = 'V' + RESIDS = 'N' + CASE(3) + JOBZ = 'F' + RESIDS = 'N' + CASE(4) + JOBZ = 'N' + RESIDS = 'N' + END SELECT + + DO iJOBREF = 1, 3 + + SELECT CASE ( iJOBREF ) + CASE(1) + JOBREF = 'R' + CASE(2) + JOBREF = 'E' + CASE(3) + JOBREF = 'N' + END SELECT + + DO iSCALE = 1, 4 + + SELECT CASE ( iSCALE ) + CASE(1) + SCALE = 'S' + CASE(2) + SCALE = 'C' + CASE(3) + SCALE = 'Y' + CASE(4) + SCALE = 'N' + END SELECT + + DO iNRNK = -1, -2, -1 + NRNK = iNRNK + NRNKsp = iNRNK + + DO iWHTSVD = 1, 3 + ! Check all four options to compute the POD basis + ! via the SVD. + WHTSVD = iWHTSVD + WHTSVDsp = iWHTSVD + + DO LWMINOPT = 1, 2 + ! Workspace query for the minimal (1) and for the optimal + ! (2) workspace lengths determined by workspace query. + + ! ZGEDMD is always tested and its results are also used for + ! comparisons with ZGEDMDQ. + + ZX(1:M,1:N) = ZX0(1:M,1:N) + ZY(1:M,1:N) = ZY0(1:M,1:N) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) & + .OR. ( INFO < 0 ) ) THEN + WRITE(*,*) 'Call to ZGEDMD workspace query failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ', & + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS + STOP + END IF + + LZWORK = INT(ZDUMMY(LWMINOPT)) + LWORK = INT(WDUMMY(1)) + LIWORK = IDUMMY(1) + + ALLOCATE(ZWORK(LZWORK)) + ALLOCATE( WORK(LWORK)) + ALLOCATE(IWORK(LIWORK)) + + CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, ZX, LDX, ZY, LDY, NRNK, TOL, & + K, ZEIGS, ZZ, LDZ, RES, ZAU, LDAU, & + ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMD failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + + SINGVX(1:N) = WORK(1:N) + + !...... ZGEDMD check point + IF ( LSAME(JOBZ,'V') ) THEN + ! Check that Z = X*W, on return from ZGEDMD + ! This checks that the returned eigenvectors in Z are + ! the product of the SVD'POD basis returned in X + ! and the eigenvectors of the rayleigh quotient + ! returned in W + CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1) + TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) ) + END DO + TMP_ZXW = MAX(TMP_ZXW, TMP ) + IF ( TMP_ZXW <= 10*M*EPS ) THEN + !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_Z_XV = NFAIL_Z_XV + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + END IF + + + !...... ZGEDMD check point + IF ( LSAME(JOBREF,'R') ) THEN + ! The matrix A*U is returned for computing refined Ritz vectors. + ! Check that A*U is computed correctly using the formula + ! A*U = Y * V * inv(SIGMA). This depends on the + ! accuracy in the computed singular values and vectors of X. + ! See the paper for an error analysis. + ! Note that the left singular vectors of the input matrix X + ! are returned in the array X. + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, & + ZZERO, ZZ1, LDZ ) + TMP = ZERO + DO i = 1, K + CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1) + TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_AU = MAX( TMP_AU, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.' + ELSE + NFAIL_AU = NFAIL_AU + 1 + WRITE(*,*) ':( .................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + ELSEIF ( LSAME(JOBREF,'E') ) THEN + ! The unscaled vectors of the Exact DMD are computed. + ! This option is included for the sake of completeness, + ! for users who prefer the Exact DMD vectors. The + ! returned vectors are in the real form, in the same way + ! as the Ritz vectors. Here we just save the vectors + ! and test them separately using a Matlab script. + + + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY ) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 ) + RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1) + END DO + END IF + !...... ZGEDMD check point + + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMD with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMD,) + + DO i=1, K + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVX(K)/(ANORM*SINGVX(1)) ) + END DO + TMP_REZ = MAX( TMP_REZ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.' + ELSE + NFAIL_REZ = NFAIL_REZ + 1 + WRITE(*,*) ':( ..................ZGEDMD FAILED!', & + 'Check the code for implementation errors.' + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + END IF + + + IF ( LSAME(JOBREF,'E') ) THEN + TMP = ZERO + DO i = 1, K + TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) ) + END DO + TMP_EX = MAX(TMP_EX,TMP) + END IF + + END IF + + DEALLOCATE(ZWORK) + DEALLOCATE(WORK) + DEALLOCATE(IWORK) + + IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN + + ZF(1:M,1:N+1) = ZF0(1:M,1:N+1) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, K, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1, & + WDUMMY, -1, IDUMMY, -1, INFO ) + + LZWORK = INT(ZDUMMY(LWMINOPT)) + ALLOCATE( ZWORK(LZWORK) ) + LIWORK = IDUMMY(1) + ALLOCATE(IWORK(LIWORK)) + LWORK = INT(WDUMMY(1)) + ALLOCATE(WORK(LWORK)) + + CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, & + WHTSVD, M, N+1, ZF, LDF, ZX, LDX, ZY, LDY, & + NRNK, TOL, KQ, ZEIGS, ZZ, LDZ, RES, ZAU, & + LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK, & + WORK, LWORK, IWORK, LIWORK, INFO ) + + IF ( INFO /= 0 ) THEN + WRITE(*,*) 'Call to ZGEDMDQ failed. & + &Check the calling sequence and the code.' + WRITE(*,*) 'The error code is ', INFO + WRITE(*,*) 'The input parameters were ',& + SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, & + M, N, LDX, LDY, NRNK, TOL + STOP + END IF + SINGVQX(1:N) = WORK(1:N) + + !..... ZGEDMDQ check point + + IF ( 1 == 0 ) THEN + ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled + TMP = ZERO + DO i = 1, MIN(K, KQ) + TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / & + SINGVX(1) ) + END DO + SVDIFF = MAX( SVDIFF, TMP ) + IF ( TMP > M*N*EPS ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_SVDIFF = NFAIL_SVDIFF + 1 + DO j =1, 3 + write(*,*) j, SINGVX(j), SINGVQX(j) + read(*,*) + END DO + + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN + ! Check that the QR factors are computed and returned + ! as requested. The residual ||F-Q*R||_F / ||F||_F + ! is compared to M*N*EPS. + ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1) + CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, & + LDF, ZY, LDY, ZONE, ZF1, LDF ) + TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / & + ZLANGE( 'F', M, N+1, ZF0, LDF, WORK ) + IF ( TMP_FQR > TOL2 ) THEN + WRITE(*,*) 'FAILED! Something was wrong with the run.' + NFAIL_F_QR = NFAIL_F_QR + 1 + ELSE + !WRITE(*,*) '........ PASSED.' + END IF + END IF + + !..... ZGEDMDQ check point + IF ( LSAME(RESIDS, 'R') ) THEN + ! Compare the residuals returned by ZGEDMDQ with the + ! explicitly computed residuals using the matrix A. + ! Compute explicitly Y1 = A*Z + CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY ) + ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms + ! of the invariant subspaces that correspond to complex conjugate + ! pairs of eigencalues. (See the description of Z in ZGEDMDQ) + + DO i=1, KQ + ! have a real eigenvalue with real eigenvector + CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 ) + ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i) + RES1(i) = DZNRM2( M, ZY1(1,i), 1) + END DO + TMP = ZERO + DO i = 1, KQ + TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * & + SINGVQX(KQ)/(ANORM*SINGVQX(1)) ) + END DO + TMP_REZQ = MAX( TMP_REZQ, TMP ) + IF ( TMP <= TOL2 ) THEN + !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.' + ELSE + NFAIL_REZQ = NFAIL_REZQ + 1 + WRITE(*,*) '................ ZGEDMDQ FAILED!', & + 'Check the code for implementation errors.' + STOP + END IF + + END IF + + DEALLOCATE( ZWORK ) + DEALLOCATE( WORK ) + DEALLOCATE( IWORK ) + + END IF ! ZGEDMDQ + +!....................................................................................................... + + END DO ! LWMINOPT + !write(*,*) 'LWMINOPT loop completed' + END DO ! iWHTSVD + !write(*,*) 'WHTSVD loop completed' + END DO ! iNRNK -2:-1 + !write(*,*) 'NRNK loop completed' + END DO ! iSCALE 1:4 + !write(*,*) 'SCALE loop completed' + END DO + !write(*,*) 'JOBREF loop completed' + END DO ! iJOBZ + !write(*,*) 'JOBZ loop completed' + + END DO ! MODE -6:6 + !write(*,*) 'MODE loop completed' + END DO ! 1 or 2 trajectories + !write(*,*) 'trajectories loop completed' + + DEALLOCATE( ZA ) + DEALLOCATE( ZAC ) + DEALLOCATE( ZZ ) + DEALLOCATE( ZF ) + DEALLOCATE( ZF0 ) + DEALLOCATE( ZF1 ) + DEALLOCATE( ZX ) + DEALLOCATE( ZX0 ) + DEALLOCATE( ZY ) + DEALLOCATE( ZY0 ) + DEALLOCATE( ZY1 ) + DEALLOCATE( ZAU ) + DEALLOCATE( ZW ) + DEALLOCATE( ZS ) + DEALLOCATE( ZZ1 ) + DEALLOCATE( RES ) + DEALLOCATE( RES1 ) + DEALLOCATE( RESEX ) + DEALLOCATE( ZEIGS ) + DEALLOCATE( SINGVX ) + DEALLOCATE( SINGVQX ) + + END DO ! LLOOP + + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMD :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + IF ( NFAIL_Z_XV == 0 ) THEN + WRITE(*,*) '>>>> Z - U*V test PASSED.' + ELSE + WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)' + WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV + END IF + IF ( NFAIL_AU == 0 ) THEN + WRITE(*,*) '>>>> A*U test PASSED. ' + ELSE + WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)' + WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU + END IF + + IF ( NFAIL_REZ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ + END IF + + IF ( NFAIL_TOTAL == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAIL_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + IF ( TEST_QRDMD ) THEN + WRITE(*,*) + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) ' Test summary for ZGEDMDQ :' + WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>' + WRITE(*,*) + + IF ( NFAIL_SVDIFF == 0 ) THEN + WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular & + &values test PASSED.' + ELSE + WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in & + &the singular values unacceptable ', & + NFAIL_SVDIFF, ' times. Test FAILED.' + WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF + END IF + + IF ( NFAIL_F_QR == 0 ) THEN + WRITE(*,*) '>>>> F - Q*R test PASSED.' + ELSE + WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)' + WRITE(*,*) 'The largest relative residual was ', TMP_FQR + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR + END IF + + IF ( NFAIL_REZQ == 0 ) THEN + WRITE(*,*) '>>>> Rezidual computation test PASSED.' + ELSE + WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)' + WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ + WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS + NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ + END IF + + IF ( NFAILQ_TOTAL == 0 ) THEN + WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.' + ELSE + WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!' + WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.' + END IF + + END IF + + WRITE(*,*) + WRITE(*,*) 'Test completed.' + STOP + END From fa03e5497a11e4861cf8908aeb09249b39473c9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 15:43:28 +0100 Subject: [PATCH 454/718] Add tests for the DMD functions (Reference-LAPACK PR 736) --- lapack-netlib/TESTING/Makefile | 43 ++++++++++++++++++++++++++++++---- lapack-netlib/TESTING/cdmd.in | 11 +++++++++ lapack-netlib/TESTING/ddmd.in | 11 +++++++++ lapack-netlib/TESTING/sdmd.in | 11 +++++++++ lapack-netlib/TESTING/zdmd.in | 11 +++++++++ 5 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 lapack-netlib/TESTING/cdmd.in create mode 100644 lapack-netlib/TESTING/ddmd.in create mode 100644 lapack-netlib/TESTING/sdmd.in create mode 100644 lapack-netlib/TESTING/zdmd.in diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile index bdea2bfaa..3963260ac 100644 --- a/lapack-netlib/TESTING/Makefile +++ b/lapack-netlib/TESTING/Makefile @@ -61,6 +61,8 @@ SEIGTST= snep.out \ scsd.out \ slse.out +SDMDEIGTST= sdmd.out + CEIGTST= cnep.out \ csep.out \ cse2.out \ @@ -82,6 +84,8 @@ CEIGTST= cnep.out \ ccsd.out \ clse.out +CDMDEIGTST= cdmd.out + DEIGTST= dnep.out \ dsep.out \ dse2.out \ @@ -103,6 +107,8 @@ DEIGTST= dnep.out \ dcsd.out \ dlse.out +DDMDEIGTST= ddmd.out + ZEIGTST= znep.out \ zsep.out \ zse2.out \ @@ -124,6 +130,7 @@ ZEIGTST= znep.out \ zcsd.out \ zlse.out +ZDMDEIGTST= zdmd.out SLINTST= stest.out @@ -142,10 +149,10 @@ ZLINTST= ztest.out ZLINTSTPROTO= zctest.out ztest_rfp.out .PHONY: single complex double complex16 -single: $(SLINTST) $(SEIGTST) -complex: $(CLINTST) $(CEIGTST) -double: $(DLINTST) $(DEIGTST) -complex16: $(ZLINTST) $(ZEIGTST) +single: $(SLINTST) $(SEIGTST) $(SDMDEIGTST) +complex: $(CLINTST) $(CEIGTST) $(CDMDEIGTST) +double: $(DLINTST) $(DEIGTST) $(DDMDEIGTST) +complex16: $(ZLINTST) $(ZEIGTST) $(ZDMDEIGTST) .PHONY: singleproto complexproto doubleproto complex16proto singleproto: $(SLINTSTPROTO) @@ -297,6 +304,10 @@ scsd.out: csd.in EIG/xeigtsts slse.out: lse.in EIG/xeigtsts @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtsts < lse.in > $@ 2>&1 + +sdmd.out: sdmd.in EIG/xdmdeigtsts + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1 # # ======== COMPLEX EIG TESTS =========================== @@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc clse.out: lse.in EIG/xeigtstc @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstc < lse.in > $@ 2>&1 + +cdmd.out: cdmd.in EIG/xdmdeigtstc + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1 # # ======== DOUBLE EIG TESTS =========================== @@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd dlse.out: lse.in EIG/xeigtstd @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstd < lse.in > $@ 2>&1 + +ddmd.out: ddmd.in EIG/xdmdeigtstd + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1 # # ======== COMPLEX16 EIG TESTS =========================== @@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz zlse.out: lse.in EIG/xeigtstz @echo LSE: Testing Constrained Linear Least Squares routines ./EIG/xeigtstz < lse.in > $@ 2>&1 + +zdmd.out: zdmd.in EIG/xdmdeigtstz + @echo DMD: Testing Dynamic Mode Decomposition routines + ./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1 # ============================================================================== LIN/xlintsts: $(FRCLIN) $(FRC) @@ -578,15 +601,27 @@ LIN/xlintstzc: $(FRCLIN) $(FRC) EIG/xeigtsts: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtsts +EIG/xdmdeigtsts: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtsts + EIG/xeigtstc: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstc +EIG/xdmdeigtstc: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstc + EIG/xeigtstd: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstd +EIG/xdmdeigtstd: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstd + EIG/xeigtstz: $(FRCEIG) $(FRC) $(MAKE) -C EIG xeigtstz +EIG/xdmdeigtstz: $(FRCEIG) $(FRC) + $(MAKE) -C EIG xdmdeigtstz + .PHONY: clean cleantest clean: cleantest cleantest: diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/cdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/ddmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/sdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in new file mode 100644 index 000000000..42d046e01 --- /dev/null +++ b/lapack-netlib/TESTING/zdmd.in @@ -0,0 +1,11 @@ +10 +5 + +20 +10 + +30 +11 + +50 +20 From c5fa318addf536754df6e1c02d3a3cfb9e3becb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 15:45:59 +0100 Subject: [PATCH 455/718] Add tests for DMD (Reference-LAPACK PR 736) --- lapack-netlib/lapack_testing.py | 85 +++++++++++++++++---------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 5582744a0..ae59926b8 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -1,31 +1,29 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- +#!/usr/bin/env python3 ############################################################################### # lapack_testing.py ############################################################################### -from __future__ import print_function from subprocess import Popen, STDOUT, PIPE import os, sys, math import getopt # Arguments try: opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n", - ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"]) + ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"]) except getopt.error as msg: print(msg) print("for help use --help") sys.exit(2) -short_summary=0 -with_file=1 -just_errors = 0 +short_summary = False +with_file = True +just_errors = False prec='x' test='all' -only_numbers=0 +only_numbers = False test_dir='TESTING' bin_dir='bin/Release' @@ -34,10 +32,9 @@ for o, a in opts: print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]") print(" - h is to print this message") print(" - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests") - print(" - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .") - print(" - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .") + print(" - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir)) + print(" - b [bin] indicates the location of the LAPACK binary files. By default, the script will use {:s}.".format(bin_dir)) print(" LEVEL OF OUTPUT") - print(" - x is to print a detailed summary") print(" - e is to print only the error summary") print(" - s is to print a short summary") print(" - n is to print the numbers of failing tests (turn on summary mode)") @@ -63,15 +60,14 @@ for o, a in opts: print(" Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output") print(" ./lapack_testing.py -n -p s -t eig ") print(" Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings") - print("Written by Julie Langou (June 2011) ") sys.exit(0) else: if o in ("-s", "--short"): - short_summary = 1 + short_summary = True if o in ("-r", "--run"): - with_file = 0 + with_file = False if o in ("-e", "--error"): - just_errors = 1 + just_errors = True if o in ( '-p', '--prec' ): prec = a if o in ( '-b', '--bin' ): @@ -81,12 +77,12 @@ for o, a in opts: if o in ( '-t', '--test' ): test = a if o in ( '-n', '--number' ): - only_numbers = 1 - short_summary = 1 + only_numbers = True + short_summary = True # process options -abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir)) +abs_bin_dir=os.path.abspath(bin_dir) os.chdir(test_dir) @@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary): nb_test_illegal=0 nb_test_info=0 - if (with_file): + if with_file: if not os.path.exists(cmdline): error_message=cmdline+" file not found" r=1 @@ -145,16 +141,16 @@ def run_summary_test( f, cmdline, short_summary): whereisrun=words_in_line.index("run)") nb_test_run+=int(words_in_line[whereisrun-2]) if (line.find("out of")!=-1): - if (short_summary==0): print(line, end=' ') + if not short_summary: print(line, end=' ') whereisout= words_in_line.index("out") nb_test_fail+=int(words_in_line[whereisout-1]) if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_illegal+=1 if (line.find(" INFO")!=-1): - if (short_summary==0):print(line, end=' ') + if not short_summary: print(line, end=' ') nb_test_info+=1 - if (with_file==1): + if with_file: pipe.close() f.flush(); @@ -169,7 +165,7 @@ try: except IOError: f = sys.stdout -if (short_summary==0): +if not short_summary: print(" ") print("---------------- Testing LAPACK Routines ----------------") print(" ") @@ -203,6 +199,8 @@ elif test=='mixed': range_prec=[1,3] elif test=='rfp': range_test=[18] +elif test=='dmd': + range_test=[20] elif test=='eig': range_test=list(range(16)) else: @@ -219,7 +217,7 @@ for dtype in range_prec: letter = dtypes[0][dtype] name = dtypes[1][dtype] - if (short_summary==0): + if not short_summary: print(" ") print("------------------------- %s ------------------------" % name) print(" ") @@ -231,19 +229,19 @@ for dtype in range_prec: letter+"gd",letter+"sb",letter+"sg", letter+"bb","glm","gqr", "gsv","csd","lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition", "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem", "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem", "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines", "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines", - "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"), + "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"), (letter+"nep", letter+"sep", letter+"se2", letter+"svd", letter+"ec",letter+"ed",letter+"gg", letter+"gd",letter+"sb",letter+"sg", letter+"bb",letter+"glm",letter+"gqr", letter+"gsv",letter+"csd",letter+"lse", - letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"), + letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"), ) @@ -252,22 +250,25 @@ for dtype in range_prec: # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING if dtest==17 and (letter=="s" or letter=="c"): continue - if (with_file==1): + if with_file: cmdbase=dtests[2][dtest]+".out" else: if dtest==16: # LIN TESTS - cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + elif dtest==20: + # DMD EIG TESTS + cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" - if (not just_errors and not short_summary): + cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing nb_test = run_summary_test(f, cmdbase, short_summary) @@ -277,19 +278,19 @@ for dtype in range_prec: list_results[3][dtype]+=nb_test[3] got_error=nb_test[1]+nb_test[2]+nb_test[3] - if (not short_summary): - if (nb_test[0]>0 and just_errors==0): + if not short_summary: + if nb_test[0] > 0 and not just_errors: print("passed: "+str(nb_test[0])) - if (nb_test[1]>0): + if nb_test[1] > 0: print("failing to pass the threshold: "+str(nb_test[1])) - if (nb_test[2]>0): + if nb_test[2] > 0: print("Illegal Error: "+str(nb_test[2])) - if (nb_test[3]>0): + if nb_test[3] > 0: print("Info Error: "+str(nb_test[3])) - if (got_error>0 and just_errors==1): + if got_error > 0 and just_errors: print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]") print("") - if (just_errors==0): + if not just_errors: print("") # elif (got_error>0): # print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3]) @@ -307,7 +308,7 @@ for dtype in range_prec: list_results[2][4]+=list_results[2][dtype] list_results[3][4]+=list_results[3][dtype] -if only_numbers==1: +if only_numbers: print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4])) else: print(summary) From 226a14c5499e46d65cfd42b7e3b04fdca0844244 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 15:50:06 +0100 Subject: [PATCH 456/718] Restore library path adjustments --- lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 5de315b6e..4e7cf4629 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -135,28 +135,28 @@ complex: xeigtstc xdmdeigtstc double: xeigtstd xdmdeigtstd complex16: xeigtstz xdmdeigtstz -xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(SDMDEIGTST): $(FRC) From effb7af2a27f4f346f204223ce2372ba462412da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 17:55:38 +0100 Subject: [PATCH 457/718] Fix memory leak (Reference-LAPACK PR 953) --- lapack-netlib/TESTING/LIN/cchkaa.F | 2 ++ lapack-netlib/TESTING/LIN/dchkaa.F | 2 ++ lapack-netlib/TESTING/LIN/schkaa.F | 2 ++ lapack-netlib/TESTING/LIN/zchkaa.F | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F index 474454a51..57d95c741 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.F +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -1232,6 +1232,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F index 74077eb94..6582cac13 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.F +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -1076,6 +1076,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F index 2b9f2ea45..036b13924 100644 --- a/lapack-netlib/TESTING/LIN/schkaa.F +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -1070,6 +1070,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) * diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F index 57d71833f..f1020f2d8 100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.F +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -1268,6 +1268,8 @@ * DEALLOCATE (A, STAT = AllocateStatus) DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (E, STAT = AllocateStatus) + DEALLOCATE (S, STAT = AllocateStatus) DEALLOCATE (RWORK, STAT = AllocateStatus) DEALLOCATE (WORK, STAT = AllocateStatus) * From 589f2b6466d7439220a72a99f5b385239bd37043 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Dec 2023 20:10:20 +0100 Subject: [PATCH 458/718] Fix search phrase used to count successful tests (Reference-LAPACK PR 954) --- lapack-netlib/lapack_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index ae59926b8..96fbeb2a6 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -136,7 +136,7 @@ def run_summary_test( f, cmdline, short_summary): for line in pipe.readlines(): f.write(str(line)) words_in_line=line.split() - if (line.find("run")!=-1): + if (line.find("run)")!=-1): # print line whereisrun=words_in_line.index("run)") nb_test_run+=int(words_in_line[whereisrun-2]) From 5f51811728645baae32ad54f7b34a00259e6e8e8 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Tue, 5 Dec 2023 22:43:36 -0800 Subject: [PATCH 459/718] try at new threading model --- CONTRIBUTORS.md | 3 + common_thread.h | 5 +- driver/others/blas_server_win32.c | 139 ++++++++++++++++-------------- 3 files changed, 79 insertions(+), 68 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 71df13634..493747052 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -216,3 +216,6 @@ In chronological order: * Pablo Romero * [2022-08] Fix building from sources for QNX + +* Mark Seminatore + * [2023-11-09] Improve Windows threading performance scaling \ No newline at end of file diff --git a/common_thread.h b/common_thread.h index 6e18d2a8e..9e7dae74a 100644 --- a/common_thread.h +++ b/common_thread.h @@ -111,8 +111,9 @@ typedef struct blas_queue { struct blas_queue *next; #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) - CRITICAL_SECTION lock; - HANDLE finish; + // CRITICAL_SECTION lock; + // HANDLE finish; + volatile int finished; #else pthread_mutex_t lock; pthread_cond_t finished; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5bdfc1276..464e3fa03 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -51,15 +51,19 @@ /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ -typedef struct{ - CRITICAL_SECTION lock; - HANDLE filled; - HANDLE killed; - - blas_queue_t *queue; /* Parameter Pointer */ - int shutdown; /* server shutdown flag */ - -} blas_pool_t; +//typedef struct{ +// CRITICAL_SECTION lock; +// HANDLE filled; +// HANDLE killed; +// +// blas_queue_t *queue; /* Parameter Pointer */ +// int shutdown; /* server shutdown flag */ +// +//} blas_pool_t; + +static blas_queue_t *work_queue = NULL; +static HANDLE kickoff_event = NULL; +static CRITICAL_SECTION queue_lock; /* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; @@ -67,7 +71,7 @@ int blas_server_avail = 0; /* Local Variables */ static BLASULONG server_lock = 0; -static blas_pool_t pool; +//static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; @@ -209,7 +213,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ void *buffer, *sa, *sb; blas_queue_t *queue; DWORD action; - HANDLE handles[] = {pool.filled, pool.killed}; + //HANDLE handles[] = {pool.filled, pool.killed}; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); @@ -225,29 +229,38 @@ static DWORD WINAPI blas_thread_server(void *arg){ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); #endif - - do { - action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); - - if (action == WAIT_OBJECT_0 + 1) break; + // event raised when work is added to the queue + WaitForSingleObject(kickoff_event, INFINITE); #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif - EnterCriticalSection(&pool.lock); +#if 1 + EnterCriticalSection(&queue_lock); - queue = pool.queue; - if (queue) pool.queue = queue->next; + queue = work_queue; + if (queue) + work_queue = work_queue->next; - LeaveCriticalSection(&pool.lock); + LeaveCriticalSection(&queue_lock); +#else + volatile work_queue_t* queue_next; + + INT_PTR prev_value; + do { + queue = (volatile work_queue_t*)work_queue; + if (!queue) + break; + + queue_next = (volatile work_queue_t*)queue->next; + prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); + } while (prev_value != work_item); +#endif if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - if (pool.queue) SetEvent(pool.filled); - sa = queue -> sa; sb = queue -> sb; @@ -331,14 +344,6 @@ static DWORD WINAPI blas_thread_server(void *arg){ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif - - EnterCriticalSection(&queue->lock); - - queue -> status = BLAS_STATUS_FINISHED; - - LeaveCriticalSection(&queue->lock); - - SetEvent(queue->finish); } /* Shutdown procedure */ @@ -366,13 +371,10 @@ int blas_thread_init(void){ #endif if (!blas_server_avail){ + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); - - pool.shutdown = 0; - pool.queue = NULL; + InitializeCriticalSection(&queue_lock); for(i = 0; i < blas_cpu_number - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, @@ -409,8 +411,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ current = queue; while (current) { - InitializeCriticalSection(¤t -> lock); - current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); current -> position = pos; #ifdef CONSISTENT_FPCSR @@ -418,23 +418,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif + current->finished = 0; current = current -> next; pos ++; } - EnterCriticalSection(&pool.lock); + EnterCriticalSection(&queue_lock); - if (pool.queue) { - current = pool.queue; - while (current -> next) current = current -> next; - current -> next = queue; - } else { - pool.queue = queue; + if (!work_queue) + { + work_queue = queue; } + else + { + blas_queue_t *next_item = work_queue; - LeaveCriticalSection(&pool.lock); + // find the end of the work queue + while (next_item) + next_item = next_item->next; - SetEvent(pool.filled); + // add new work to the end + next_item = queue; + } + + LeaveCriticalSection(&queue_lock); + + SetEvent(kickoff_event); return 0; } @@ -449,21 +458,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ #ifdef SMP_DEBUG fprintf(STDERR, "Waiting Queue ..\n"); #endif + while (!queue->finished) + YIELDING; - WaitForSingleObject(queue->finish, INFINITE); - - CloseHandle(queue->finish); - DeleteCriticalSection(&queue -> lock); - - queue = queue -> next; - num --; + queue = queue->next; + num--; } #ifdef SMP_DEBUG fprintf(STDERR, "Completely Done.\n\n"); #endif + // if work was added to the queue after this batch we can't sleep the worker threads + // by resetting the event + EnterCriticalSection(&queue_lock); - return 0; + if (work_queue == NULL) + ResetEvent(kickoff_event); + + LeaveCriticalSection(&queue_lock); + + return 0; } /* Execute Threads */ @@ -512,8 +526,6 @@ int BLASFUNC(blas_thread_shutdown)(void){ if (blas_server_avail){ - SetEvent(pool.killed); - for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -528,9 +540,6 @@ int BLASFUNC(blas_thread_shutdown)(void){ CloseHandle(blas_threads[i]); } - CloseHandle(pool.filled); - CloseHandle(pool.killed); - blas_server_avail = 0; } @@ -558,13 +567,11 @@ void goto_set_num_threads(int num_threads) //increased_threads = 1; if (!blas_server_avail){ + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - InitializeCriticalSection(&pool.lock); - pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); - pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + InitializeCriticalSection(&queue_lock); - pool.shutdown = 0; - pool.queue = NULL; blas_server_avail = 1; } From 4ebf814b4258904d1f48bfe427a6727514d9efa6 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Tue, 5 Dec 2023 23:28:37 -0800 Subject: [PATCH 460/718] fix bug failing to mark task as finished. --- driver/others/blas_server_win32.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 464e3fa03..5af1f1a51 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -344,6 +344,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif + + queue->finished = 1; + } /* Shutdown procedure */ From 993ede7c70658870921b446b145310f1b6c1edf6 Mon Sep 17 00:00:00 2001 From: yancheng Date: Mon, 27 Nov 2023 11:30:34 +0800 Subject: [PATCH 461/718] loongarch64: Add optimizations for scal. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dscal_lasx.S | 194 +++++++++++++++++++++ kernel/loongarch64/dscal_lsx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/sscal_lasx.S | 188 +++++++++++++++++++++ kernel/loongarch64/sscal_lsx.S | 194 +++++++++++++++++++++ 6 files changed, 787 insertions(+) create mode 100644 kernel/loongarch64/dscal_lasx.S create mode 100644 kernel/loongarch64/dscal_lsx.S create mode 100644 kernel/loongarch64/sscal_lasx.S create mode 100644 kernel/loongarch64/sscal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index b2a396674..e553c4b95 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,4 +4,7 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +SSCALKERNEL = sscal_lsx.S +DSCALKERNEL = dscal_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 020a82303..4c0c1c2c8 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,6 +4,9 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +SSCALKERNEL = sscal_lasx.S +DSCALKERNEL = dscal_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S new file mode 100644 index 000000000..153662378 --- /dev/null +++ b/kernel/loongarch64/dscal_lasx.S @@ -0,0 +1,194 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: + bge $r0, I, .L32 + .align 3 +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + xvst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + addi.d I, I, -1 + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S new file mode 100644 index 000000000..55f497752 --- /dev/null +++ b/kernel/loongarch64/dscal_lsx.S @@ -0,0 +1,205 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.d $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + fst.d a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.d a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.d a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.d a1, ALPHA, a1 + fst.d a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S new file mode 100644 index 000000000..329f24659 --- /dev/null +++ b/kernel/loongarch64/sscal_lasx.S @@ -0,0 +1,188 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.s $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.s TEMP, ALPHA + xvreplgr2vr.w VALPHA, TEMP + move XX, X + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.s TEMP, ALPHA + xvreplgr2vr.w VALPHA , TEMP + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.s a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.s a1, ALPHA, a1 + fst.s a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S new file mode 100644 index 000000000..d0ea1307d --- /dev/null +++ b/kernel/loongarch64/sscal_lsx.S @@ -0,0 +1,194 @@ +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + fcmp.ceq.s $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + movfr2gr.s TEMP, ALPHA + vreplgr2vr.w VALPHA, TEMP + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + fst.s a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L23: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE + vst VX0, X, 4 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L27: + fst.s a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + movfr2gr.s TEMP, ALPHA + vreplgr2vr.w VALPHA , TEMP + .align 3 + +.L31: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L33: + fld.s a1, X, 0 * SIZE + addi.d I, I, -1 + fmul.s a1, ALPHA, a1 + fst.s a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE From 265b5f2e803d2d66b7fe75ec356dc14fcb5ef726 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 10:57:13 +0800 Subject: [PATCH 462/718] loongarch64: Add optimizations for amax. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/damax_lasx.S | 183 ++++++++++++++++++++ kernel/loongarch64/damax_lsx.S | 145 ++++++++++++++++ kernel/loongarch64/samax_lasx.S | 208 +++++++++++++++++++++++ kernel/loongarch64/samax_lsx.S | 177 +++++++++++++++++++ 6 files changed, 719 insertions(+) create mode 100644 kernel/loongarch64/damax_lasx.S create mode 100644 kernel/loongarch64/damax_lsx.S create mode 100644 kernel/loongarch64/samax_lasx.S create mode 100644 kernel/loongarch64/samax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e553c4b95..a4d5f8f87 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -7,4 +7,7 @@ DDOTKERNEL = dot_lsx.S SSCALKERNEL = sscal_lsx.S DSCALKERNEL = dscal_lsx.S +SAMAXKERNEL = samax_lsx.S +DAMAXKERNEL = damax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4c0c1c2c8..8c7481ae6 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -7,6 +7,9 @@ DDOTKERNEL = dot_lasx.S SSCALKERNEL = sscal_lasx.S DSCALKERNEL = dscal_lasx.S +SAMAXKERNEL = samax_lasx.S +DAMAXKERNEL = damax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S new file mode 100644 index 000000000..c44ce4995 --- /dev/null +++ b/kernel/loongarch64/damax_lasx.S @@ -0,0 +1,183 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define x5 $xr13 +#define x6 $xr14 +#define x7 $xr15 +#define x8 $xr16 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr18 +#define VM3 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmaxa.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmaxa.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfmaxa.d VM2, x3, x4 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:08:09 +0800 Subject: [PATCH 463/718] loongarch64: Add optimization for amin. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/damin_lasx.S | 178 +++++++++++++++++++ kernel/loongarch64/damin_lsx.S | 145 ++++++++++++++++ kernel/loongarch64/samin_lasx.S | 208 +++++++++++++++++++++++ kernel/loongarch64/samin_lsx.S | 177 +++++++++++++++++++ 6 files changed, 714 insertions(+) create mode 100644 kernel/loongarch64/damin_lasx.S create mode 100644 kernel/loongarch64/damin_lsx.S create mode 100644 kernel/loongarch64/samin_lasx.S create mode 100644 kernel/loongarch64/samin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a4d5f8f87..279ff6a9c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -10,4 +10,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = samax_lsx.S DAMAXKERNEL = damax_lsx.S +SAMINKERNEL = samin_lsx.S +DAMINKERNEL = damin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 8c7481ae6..83db79050 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -10,6 +10,9 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = samax_lasx.S DAMAXKERNEL = damax_lasx.S +SAMINKERNEL = samin_lasx.S +DAMINKERNEL = damin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/damin_lasx.S b/kernel/loongarch64/damin_lasx.S new file mode 100644 index 000000000..9d96dd997 --- /dev/null +++ b/kernel/loongarch64/damin_lasx.S @@ -0,0 +1,178 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvld VX1, X, 4 * SIZE + xvfmina.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmina.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfmina.d VM2, x3, x4 + xvfmina.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:30:02 +0800 Subject: [PATCH 464/718] loongarch64: Add optimization for max. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dmax_lasx.S | 175 +++++++++++++++++++ kernel/loongarch64/dmax_lsx.S | 141 ++++++++++++++++ kernel/loongarch64/smax_lasx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/smax_lsx.S | 171 +++++++++++++++++++ 6 files changed, 698 insertions(+) create mode 100644 kernel/loongarch64/dmax_lasx.S create mode 100644 kernel/loongarch64/dmax_lsx.S create mode 100644 kernel/loongarch64/smax_lasx.S create mode 100644 kernel/loongarch64/smax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 279ff6a9c..e00893b72 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -13,4 +13,7 @@ DAMAXKERNEL = damax_lsx.S SAMINKERNEL = samin_lsx.S DAMINKERNEL = damin_lsx.S +SMAXKERNEL = smax_lsx.S +DMAXKERNEL = dmax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 83db79050..f238436f5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -13,6 +13,9 @@ DAMAXKERNEL = damax_lasx.S SAMINKERNEL = samin_lasx.S DAMINKERNEL = damin_lasx.S +SMAXKERNEL = smax_lasx.S +DMAXKERNEL = dmax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S new file mode 100644 index 000000000..46366d2ec --- /dev/null +++ b/kernel/loongarch64/dmax_lasx.S @@ -0,0 +1,175 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmax.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmax.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfmax.d VM2, x3, x4 + xvfmax.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:51:19 +0800 Subject: [PATCH 465/718] loongarch64: Add optimization for min. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dmin_lasx.S | 175 +++++++++++++++++++ kernel/loongarch64/dmin_lsx.S | 143 ++++++++++++++++ kernel/loongarch64/smin_lasx.S | 205 +++++++++++++++++++++++ kernel/loongarch64/smin_lsx.S | 174 +++++++++++++++++++ 6 files changed, 703 insertions(+) create mode 100644 kernel/loongarch64/dmin_lasx.S create mode 100644 kernel/loongarch64/dmin_lsx.S create mode 100644 kernel/loongarch64/smin_lasx.S create mode 100644 kernel/loongarch64/smin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e00893b72..0ff73c2db 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -16,4 +16,7 @@ DAMINKERNEL = damin_lsx.S SMAXKERNEL = smax_lsx.S DMAXKERNEL = dmax_lsx.S +SMINKERNEL = smin_lsx.S +DMINKERNEL = dmin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f238436f5..71f53d9d7 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -16,6 +16,9 @@ DAMINKERNEL = damin_lasx.S SMAXKERNEL = smax_lasx.S DMAXKERNEL = dmax_lasx.S +SMINKERNEL = smin_lasx.S +DMINKERNEL = dmin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dmin_lasx.S b/kernel/loongarch64/dmin_lasx.S new file mode 100644 index 000000000..e76056565 --- /dev/null +++ b/kernel/loongarch64/dmin_lasx.S @@ -0,0 +1,175 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define J $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define TEMP $r16 +#define m0 $xr8 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 +#define VM2 $xr19 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + srai.d I, N, 3 + bge $r0, I, .L12 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvfmin.d VM1, VX1, VX0 + addi.d X, X, 8 * SIZE + xvfmin.d VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfmin.d VM2, x3, x4 + xvfmin.d VM0, VM1, VM2 + .align 3 + +.L12: //INCX==1 and N<8 + andi I, N, 7 + li.d J, 4 + bge J, I, .L13 // 4 Date: Thu, 7 Dec 2023 11:56:41 +0800 Subject: [PATCH 466/718] loongarch64: Add optimizations for imax. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idmax_lasx.S | 273 +++++++++++++++++ kernel/loongarch64/idmax_lsx.S | 225 ++++++++++++++ kernel/loongarch64/ismax_lasx.S | 375 +++++++++++++++++++++++ kernel/loongarch64/ismax_lsx.S | 272 ++++++++++++++++ 6 files changed, 1151 insertions(+) create mode 100644 kernel/loongarch64/idmax_lasx.S create mode 100644 kernel/loongarch64/idmax_lsx.S create mode 100644 kernel/loongarch64/ismax_lasx.S create mode 100644 kernel/loongarch64/ismax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 0ff73c2db..3c201ad89 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -19,4 +19,7 @@ DMAXKERNEL = dmax_lsx.S SMINKERNEL = smin_lsx.S DMINKERNEL = dmin_lsx.S +ISMAXKERNEL = ismax_lsx.S +IDMAXKERNEL = idmax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 71f53d9d7..953793b9a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -19,6 +19,9 @@ DMAXKERNEL = dmax_lasx.S SMINKERNEL = smin_lasx.S DMINKERNEL = dmin_lasx.S +ISMAXKERNEL = ismax_lasx.S +IDMAXKERNEL = idmax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S new file mode 100644 index 000000000..bbfe0941a --- /dev/null +++ b/kernel/loongarch64/idmax_lasx.S @@ -0,0 +1,273 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S new file mode 100644 index 000000000..1b4734bab --- /dev/null +++ b/kernel/loongarch64/idmax_lsx.S @@ -0,0 +1,225 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfcmp.clt.d VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/ismax_lasx.S new file mode 100644 index 000000000..843dd6c6a --- /dev/null +++ b/kernel/loongarch64/ismax_lasx.S @@ -0,0 +1,375 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VM0, VX0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S new file mode 100644 index 000000000..33b326bbd --- /dev/null +++ b/kernel/loongarch64/ismax_lsx.S @@ -0,0 +1,272 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f15, $f9 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From e3fb2b5afa63fc0db472268adfd7038146ea6ac6 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:01:05 +0800 Subject: [PATCH 467/718] loongarch64: Add optimizations for imin. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idmin_lasx.S | 272 +++++++++++++++++ kernel/loongarch64/idmin_lsx.S | 225 ++++++++++++++ kernel/loongarch64/ismin_lasx.S | 374 +++++++++++++++++++++++ kernel/loongarch64/ismin_lsx.S | 271 ++++++++++++++++ 6 files changed, 1148 insertions(+) create mode 100644 kernel/loongarch64/idmin_lasx.S create mode 100644 kernel/loongarch64/idmin_lsx.S create mode 100644 kernel/loongarch64/ismin_lasx.S create mode 100644 kernel/loongarch64/ismin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 3c201ad89..d0eab3d7c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -22,4 +22,7 @@ DMINKERNEL = dmin_lsx.S ISMAXKERNEL = ismax_lsx.S IDMAXKERNEL = idmax_lsx.S +ISMINKERNEL = ismin_lsx.S +IDMINKERNEL = idmin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 953793b9a..92389001b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -22,6 +22,9 @@ DMINKERNEL = dmin_lasx.S ISMAXKERNEL = ismax_lasx.S IDMAXKERNEL = idmax_lasx.S +ISMINKERNEL = ismin_lasx.S +IDMINKERNEL = idmin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S new file mode 100644 index 000000000..7930d4963 --- /dev/null +++ b/kernel/loongarch64/idmin_lasx.S @@ -0,0 +1,272 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.d VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.d VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S new file mode 100644 index 000000000..8b6edcbf0 --- /dev/null +++ b/kernel/loongarch64/idmin_lsx.S @@ -0,0 +1,225 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfcmp.clt.d VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + vfcmp.clt.d VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfcmp.clt.d VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.d $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/ismin_lasx.S new file mode 100644 index 000000000..15f6e2ec9 --- /dev/null +++ b/kernel/loongarch64/ismin_lasx.S @@ -0,0 +1,374 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfcmp.clt.s VT0, VX0, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM0, VX0, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x2, x1 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x4, x3 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, x1, VM0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f13, $f15 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + fcmp.clt.s $fcc0, $f9, $f15 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + add.d X, X, INCX + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S new file mode 100644 index 000000000..f90ebbd57 --- /dev/null +++ b/kernel/loongarch64/ismin_lsx.S @@ -0,0 +1,271 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfcmp.clt.s VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v VINC4, VI1, VI2, VT0 + vfcmp.clt.s VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v VINC8, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, x1, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, x2, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, x3, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, x4, VM0 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + fcmp.clt.s $fcc0, $f9, $f15 + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From be83f5e4e08bfc5a7d94d5ed659368ca077c4374 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:07:30 +0800 Subject: [PATCH 468/718] loongarch64: Add optimizations for iamax. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idamax_lasx.S | 275 +++++++++++++++++ kernel/loongarch64/idamax_lsx.S | 267 ++++++++++++++++ kernel/loongarch64/isamax_lasx.S | 378 +++++++++++++++++++++++ kernel/loongarch64/isamax_lsx.S | 275 +++++++++++++++++ 6 files changed, 1201 insertions(+) create mode 100644 kernel/loongarch64/idamax_lasx.S create mode 100644 kernel/loongarch64/idamax_lsx.S create mode 100644 kernel/loongarch64/isamax_lasx.S create mode 100644 kernel/loongarch64/isamax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index d0eab3d7c..a4c8927ba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -25,4 +25,7 @@ IDMAXKERNEL = idmax_lsx.S ISMINKERNEL = ismin_lsx.S IDMINKERNEL = idmin_lsx.S +ISAMAXKERNEL = isamax_lsx.S +IDAMAXKERNEL = idamax_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 92389001b..835b027b8 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -25,6 +25,9 @@ IDMAXKERNEL = idmax_lasx.S ISMINKERNEL = ismin_lasx.S IDMINKERNEL = idmin_lasx.S +ISAMAXKERNEL = isamax_lasx.S +IDAMAXKERNEL = idamax_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S new file mode 100644 index 000000000..8248ee757 --- /dev/null +++ b/kernel/loongarch64/idamax_lasx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + xvfmaxa.d VM1, x1, VM0 + xvfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S new file mode 100644 index 000000000..fb2d5bac1 --- /dev/null +++ b/kernel/loongarch64/idamax_lsx.S @@ -0,0 +1,267 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L17 + .align 3 + +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/isamax_lasx.S new file mode 100644 index 000000000..2800b1d43 --- /dev/null +++ b/kernel/loongarch64/isamax_lasx.S @@ -0,0 +1,378 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmaxa.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmaxa.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + xvfmaxa.s VM1, x1, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S new file mode 100644 index 000000000..a18aa7354 --- /dev/null +++ b/kernel/loongarch64/isamax_lsx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + vfmaxa.s VM1, x1, VM0 + vfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 49829b2b7db5d4ad78447c47391e8c0c96ffe803 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:11:30 +0800 Subject: [PATCH 469/718] loongarch64: Add optimizations for iamin. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/idamin_lasx.S | 275 +++++++++++++++++ kernel/loongarch64/idamin_lsx.S | 228 ++++++++++++++ kernel/loongarch64/isamin_lasx.S | 378 +++++++++++++++++++++++ kernel/loongarch64/isamin_lsx.S | 275 +++++++++++++++++ 6 files changed, 1162 insertions(+) create mode 100644 kernel/loongarch64/idamin_lasx.S create mode 100644 kernel/loongarch64/idamin_lsx.S create mode 100644 kernel/loongarch64/isamin_lasx.S create mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a4c8927ba..bff52ce93 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,4 +28,7 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S +ISAMINKERNEL = isamin_lsx.S +IDAMINKERNEL = idamin_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 835b027b8..a08598cc5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,6 +28,9 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S +ISAMINKERNEL = isamin_lasx.S +IDAMINKERNEL = idamin_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S new file mode 100644 index 000000000..6ef1e8903 --- /dev/null +++ b/kernel/loongarch64/idamin_lasx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + xvfcmp.ceq.d VT0, VM0, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.d VT0, VM0, x2 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.d VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.d VT0, VM0, x3 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.d VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.d VT0, VM0, x4 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.d VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.d i0, $f20 + .align 3 + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + xvfmina.d VM1, x1, VM0 + xvfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S new file mode 100644 index 000000000..9eb9d883f --- /dev/null +++ b/kernel/loongarch64/idamin_lsx.S @@ -0,0 +1,228 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.d $f17, TEMP + ffint.d.l $f17, $f17 + vfcmp.ceq.d VT0, x2, x1 + fcmp.ceq.d $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + vfmina.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.d i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/isamin_lasx.S new file mode 100644 index 000000000..cbdf32530 --- /dev/null +++ b/kernel/loongarch64/isamin_lasx.S @@ -0,0 +1,378 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvadd.w VI1, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t2, 1 + xvinsgr2vr.w VM0, t3, 2 + xvinsgr2vr.w VM0, t4, 3 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.w VM0, t1, 4 + xvinsgr2vr.w VM0, t2, 5 + xvinsgr2vr.w VM0, t3, 6 + xvinsgr2vr.w VM0, t4, 7 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvadd.w VI1, VI1, VINC8 + xvfmina.s VM1, VX0, VM0 + xvfcmp.ceq.s VT0, VM1, VM0 + addi.d I, I, -1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfmina.s VM1, x1, x2 + xvfcmp.ceq.s VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + xvfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + xvfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + xvfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + xvfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + xvfmina.s VM0, VX0, VM0 + xvfcmp.ceq.s VT0, VM0, VX0 + xvbitsel.v VI0, VI0, VI1, VT0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + xvfmina.s VM1, x1, VM0 + xvfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S new file mode 100644 index 000000000..598888660 --- /dev/null +++ b/kernel/loongarch64/isamin_lsx.S @@ -0,0 +1,275 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + li.d TEMP, 1 //处理尾数相等时取最小序号 + movgr2fr.w $f17, TEMP + ffint.s.w $f17, $f17 + vfcmp.ceq.s VT0, VM0, x1 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + vfcmp.ceq.s VT0, VM0, x2 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + vfcmp.ceq.s VT0, VM0, x3 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + vfcmp.ceq.s VT0, VM0, x4 + fcmp.ceq.s $fcc0, $f23, $f17 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.s $f9, X, 0 + addi.d I, I, -1 + vfmina.s VM1, x1, VM0 + vfcmp.ceq.s VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From 174c25766b6b43e36a9ace6da26fe21f3a7305a9 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:15:46 +0800 Subject: [PATCH 470/718] loongarch64: Add optimizations for copy. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dcopy_lasx.S | 224 ++++++++++++++++++++++ kernel/loongarch64/dcopy_lsx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/scopy_lasx.S | 216 +++++++++++++++++++++ kernel/loongarch64/scopy_lsx.S | 220 +++++++++++++++++++++ 6 files changed, 898 insertions(+) create mode 100644 kernel/loongarch64/dcopy_lasx.S create mode 100644 kernel/loongarch64/dcopy_lsx.S create mode 100644 kernel/loongarch64/scopy_lasx.S create mode 100644 kernel/loongarch64/scopy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index bff52ce93..565bec0f2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,4 +31,7 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S +SCOPYKERNEL = scopy_lsx.S +DCOPYKERNEL = dcopy_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a08598cc5..a2443720b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,6 +31,9 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S +SCOPYKERNEL = scopy_lasx.S +DCOPYKERNEL = dcopy_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S new file mode 100644 index 000000000..9d7da4a80 --- /dev/null +++ b/kernel/loongarch64/dcopy_lasx.S @@ -0,0 +1,224 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S new file mode 100644 index 000000000..161655bbd --- /dev/null +++ b/kernel/loongarch64/dcopy_lsx.S @@ -0,0 +1,232 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + addi.d I, I, -1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.d $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX0, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX0, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.d a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S new file mode 100644 index 000000000..7db1e7cee --- /dev/null +++ b/kernel/loongarch64/scopy_lasx.S @@ -0,0 +1,216 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S new file mode 100644 index 000000000..32150d3d6 --- /dev/null +++ b/kernel/loongarch64/scopy_lsx.S @@ -0,0 +1,220 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + fst.s $f12, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s a3, X, 0 * SIZE + add.d Y, Y, INCY + fst.s a4, X, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 360acc0a41dd2ec3fc461bac390c74c958525ae8 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 12:57:05 +0800 Subject: [PATCH 471/718] loongarch64: Add optimizations for swap. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dswap_lasx.S | 301 +++++++++++++++++++++ kernel/loongarch64/dswap_lsx.S | 317 +++++++++++++++++++++++ kernel/loongarch64/sswap_lasx.S | 286 ++++++++++++++++++++ kernel/loongarch64/sswap_lsx.S | 294 +++++++++++++++++++++ 6 files changed, 1204 insertions(+) create mode 100644 kernel/loongarch64/dswap_lasx.S create mode 100644 kernel/loongarch64/dswap_lsx.S create mode 100644 kernel/loongarch64/sswap_lasx.S create mode 100644 kernel/loongarch64/sswap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 565bec0f2..879a6f68b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -34,4 +34,7 @@ IDAMINKERNEL = idamin_lsx.S SCOPYKERNEL = scopy_lsx.S DCOPYKERNEL = dcopy_lsx.S +SSWAPKERNEL = sswap_lsx.S +DSWAPKERNEL = dswap_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a2443720b..581cfdbbe 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -34,6 +34,9 @@ IDAMINKERNEL = idamin_lasx.S SCOPYKERNEL = scopy_lasx.S DCOPYKERNEL = dcopy_lasx.S +SSWAPKERNEL = sswap_lasx.S +DSWAPKERNEL = dswap_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S new file mode 100644 index 000000000..221cb7fa2 --- /dev/null +++ b/kernel/loongarch64/dswap_lasx.S @@ -0,0 +1,301 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + addi.d I, I, -1 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCY + ld.d t2, X, 0 * SIZE + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + //addi.d TEMP, N, -1 + //mul.d TEMP, TEMP, INCX + //sub.d X, X, TEMP + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S new file mode 100644 index 000000000..7f7f585e1 --- /dev/null +++ b/kernel/loongarch64/dswap_lsx.S @@ -0,0 +1,317 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + addi.d I, I, -1 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCY + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCY + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bgez INCX, .L220 + //addi.d TEMP, N, -1 + //mul.d TEMP, TEMP, INCX + //sub.d X, X, TEMP + .align 3 + +.L220: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a1, X, 0 * SIZE + add.d X, X, INCX + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b1, Y, 0 * SIZE + fst.d a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a2, X, 0 * SIZE + add.d X, X, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b2, Y, 0 * SIZE + fst.d a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.d a3, X, 0 * SIZE + add.d X, X, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b3, Y, 0 * SIZE + fst.d a3, Y, 0 * SIZE + fld.d a4, X, 0 * SIZE + add.d X, X, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.d b4, Y, 0 * SIZE + fst.d a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.d b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.d b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.d $f12, Y, 0 * SIZE + fst.d $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S new file mode 100644 index 000000000..7184eff45 --- /dev/null +++ b/kernel/loongarch64/sswap_lasx.S @@ -0,0 +1,286 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCY + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCY + ld.w t2, X, 0 * SIZE + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S new file mode 100644 index 000000000..4f19a8024 --- /dev/null +++ b/kernel/loongarch64/sswap_lsx.S @@ -0,0 +1,294 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + addi.d I, I, -1 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + vstelm.w VX2, X, 0, 1 + add.d X, X, INCY + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0, 0 + add.d X, X, INCY + ld.w t2, X, 0 * SIZE + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a1, X, 0 * SIZE + add.d X, X, INCX + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b1, Y, 0 * SIZE + fst.s a1, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a2, X, 0 * SIZE + add.d X, X, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b2, Y, 0 * SIZE + fst.s a2, Y, 0 * SIZE + add.d Y, Y, INCY + fld.s a3, X, 0 * SIZE + add.d X, X, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b3, Y, 0 * SIZE + fst.s a3, Y, 0 * SIZE + fld.s a4, X, 0 * SIZE + add.d X, X, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + fld.s b4, Y, 0 * SIZE + fst.s a4, Y, 0 * SIZE + add.d Y, Y, INCY + fst.s b1, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b2, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b3, XX, 0 * SIZE + add.d XX, XX, INCX + fst.s b4, XX, 0 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fst.s $f12, Y, 0 * SIZE + fst.s $f14, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From d4c96a35a80da33c9d71fc5dd291f05f55900e27 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:02:03 +0800 Subject: [PATCH 472/718] loongarch64: Add optimizations for axpy and axpby. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/daxpby_lasx.S | 629 ++++++++++++++++++++ kernel/loongarch64/daxpby_lsx.S | 693 +++++++++++++++++++++++ kernel/loongarch64/daxpy_lasx.S | 338 +++++++++++ kernel/loongarch64/daxpy_lsx.S | 365 ++++++++++++ kernel/loongarch64/saxpby_lasx.S | 597 +++++++++++++++++++ kernel/loongarch64/saxpby_lsx.S | 629 ++++++++++++++++++++ kernel/loongarch64/saxpy_lasx.S | 323 +++++++++++ kernel/loongarch64/saxpy_lsx.S | 338 +++++++++++ 10 files changed, 3924 insertions(+) create mode 100644 kernel/loongarch64/daxpby_lasx.S create mode 100644 kernel/loongarch64/daxpby_lsx.S create mode 100644 kernel/loongarch64/daxpy_lasx.S create mode 100644 kernel/loongarch64/daxpy_lsx.S create mode 100644 kernel/loongarch64/saxpby_lasx.S create mode 100644 kernel/loongarch64/saxpby_lsx.S create mode 100644 kernel/loongarch64/saxpy_lasx.S create mode 100644 kernel/loongarch64/saxpy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 879a6f68b..a94303151 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -37,4 +37,10 @@ DCOPYKERNEL = dcopy_lsx.S SSWAPKERNEL = sswap_lsx.S DSWAPKERNEL = dswap_lsx.S +SAXPYKERNEL = saxpy_lsx.S +DAXPYKERNEL = daxpy_lsx.S + +SAXPBYKERNEL = saxpby_lsx.S +DAXPBYKERNEL = daxpby_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 581cfdbbe..4cfd53058 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -37,6 +37,12 @@ DCOPYKERNEL = dcopy_lasx.S SSWAPKERNEL = sswap_lasx.S DSWAPKERNEL = dswap_lasx.S +SAXPYKERNEL = saxpy_lasx.S +DAXPYKERNEL = daxpy_lasx.S + +SAXPBYKERNEL = saxpby_lasx.S +DAXPBYKERNEL = daxpby_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/daxpby_lasx.S new file mode 100644 index 000000000..4b19703e7 --- /dev/null +++ b/kernel/loongarch64/daxpby_lasx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + xvreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX2, VX2, VXB, VX0 + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + xvld VX1, X, 4 * SIZE + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX0, VX0, VXA + xvfmul.d VX1, VX1, VXA + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + xvfmadd.d VX2, VX2, VXB, VX0 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VXA, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX0, Y, 0 * SIZE + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VX2, VX2, VXB + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvfmadd.d VX2, VX2, VXB, VX0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX1, VX1, VXA + xvfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VX0, VX0, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX0, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX1, VX1, VXA + addi.d I, I, -1 + xvstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX2, VX2, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + xvfmul.d VX3, VX3, VXB + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/daxpby_lsx.S new file mode 100644 index 000000000..9aafbaf2a --- /dev/null +++ b/kernel/loongarch64/daxpby_lsx.S @@ -0,0 +1,693 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + movfr2gr.d t2, BETA + vreplgr2vr.d VXB, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vfmadd.d VX2, VX2, VXB, VX0 + vfmadd.d VX3, VX3, VXB, VX1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX2, VX2, VXA + vfmul.d VX3, VX3, VXA + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0\ + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXB + vfmul.d VX1, VX1, VXB + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vfmul.d VX0, VX0, VXA + vld VX1, X, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vld VX0, X, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + vld VX1, X, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX0, VX0, VXA + vfmul.d VX1, VX1, VXA + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + vld VX2, Y, 4 * SIZE + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX3, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + vld VX3, Y, 6 * SIZE + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX2, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VXA, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vfmul.d VX1, VXA, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vst VX1, Y, 2 * SIZE + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VX2, VX2, VXB + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VX0, VXA + vfmadd.d VX2, VX2, VXB, VX0 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VX1, VX1, VXA + vfmadd.d VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX0, VX0, VXA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX0, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX1, VX1, VXA + addi.d I, I, -1 + vstelm.d VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX2, VX2, VXB + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + vfmul.d VX3, VX3, VXB + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.d $f12, $f12, ALPHA + fmadd.d $f13, $f13, BETA, $f12 + fst.d $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/daxpy_lasx.S new file mode 100644 index 000000000..bafd871ab --- /dev/null +++ b/kernel/loongarch64/daxpy_lasx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + xvreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfadd.d VX2, VX0, VX2 + xvfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmadd.d VX2, VX0, VXA, VX2 + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX1, X, 4 * SIZE + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmadd.d VX2, VX0, VXA, VX2 + xvld VX3, Y, 4 * SIZE + xvst VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX2, VX0, VXA, VX2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + xvstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + b .L999 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/daxpy_lsx.S new file mode 100644 index 000000000..fc88f0bb9 --- /dev/null +++ b/kernel/loongarch64/daxpy_lsx.S @@ -0,0 +1,365 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + movgr2fr.d a2, TEMP + ffint.d.l a2, a2 + fcmp.ceq.d $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, ALPHA + vreplgr2vr.d VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.d $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfadd.d VX2, VX0, VX2 + vfadd.d VX3, VX1, VX3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + vfmadd.d VX2, VX0, VXA, VX2 + vfmadd.d VX3, VX1, VXA, VX3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 2 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vld VX0, X, 4 * SIZE + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + vld VX1, X, 6 * SIZE + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 2 * SIZE + vst VX2, Y, 0 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + vld VX2, Y, 4 * SIZE + vst VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmadd.d VX2, VX0, VXA, VX2 + vld VX3, Y, 6 * SIZE + vst VX2, Y, 4 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX2, VX0, VXA, VX2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX2, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmadd.d VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.d VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VX3, YY, 0, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.d $f12, X, 0 * SIZE + fld.d $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.d $f14, $f12, $f0, $f14 + fst.d $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S new file mode 100644 index 000000000..c5d1ff402 --- /dev/null +++ b/kernel/loongarch64/saxpby_lasx.S @@ -0,0 +1,597 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 +#define VXB $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + xvreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + xvld VX0, X, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + xvld VX2, Y, 0 * SIZE + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + xvst VXZ, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S new file mode 100644 index 000000000..7f8cea2dd --- /dev/null +++ b/kernel/loongarch64/saxpby_lsx.S @@ -0,0 +1,629 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHA $f0 +#define X $r5 +#define INCX $r6 +#define BETA $f1 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 +#define VXB $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + movfr2gr.s t2, BETA + vreplgr2vr.w VXB, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 + b .L111 // ALPHA!=0 BETA!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L114 // ALPHA==0 BETA==0 + b .L113 // ALPHA==0 BETA!=0 + .align 3 + +.L111: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 + b .L121 // ALPHA!=0 BETA!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L124 // ALPHA==0 BETA==0 + b .L123 // ALPHA==0 BETA!=0 + .align 3 + +.L121: // ALPHA!=0 BETA!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // ALPHA!=0 BETA==0 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 + b .L211 // ALPHA!=0 BETA!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L214 // ALPHA==0 BETA==0 + b .L213 // ALPHA==0 BETA!=0 + .align 3 + +.L211: // ALPHA!=0 BETA!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // ALPHA==0 BETA!=0 + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // ALPHA==0 BETA==0 + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L214 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 + b .L221 // ALPHA!=0 BETA!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, BETA, a1 + bcnez $fcc0, .L224 // ALPHA==0 BETA==0 + b .L223 // ALPHA==0 BETA!=0 + .align 3 + +.L221: // ALPHA!=0 BETA!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // ALPHA!=0 BETA==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // ALPHA==0 BETA!=0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // ALPHA==0 BETA==0 + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + addi.d I, I, -1 + fmul.s $f12, $f12, ALPHA + fmadd.s $f13, $f13, BETA, $f12 + fst.s $f13, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S new file mode 100644 index 000000000..609e26328 --- /dev/null +++ b/kernel/loongarch64/saxpy_lasx.S @@ -0,0 +1,323 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXA $xr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + xvreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L112 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S new file mode 100644 index 000000000..f47415ed6 --- /dev/null +++ b/kernel/loongarch64/saxpy_lsx.S @@ -0,0 +1,338 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXA $vr23 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + movgr2fr.d a2, TEMP + ffint.s.l a2, a2 + fcmp.ceq.s $fcc0, ALPHA, a1 + bcnez $fcc0, .L999 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, ALPHA + vreplgr2vr.w VXA, t1 + + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L113 + fcmp.ceq.s $fcc0, ALPHA, a2 + bceqz $fcc0, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L113 + .align 3 + +.L112: + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L113 + .align 3 + +.L113: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L114: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L114 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + move YY, Y + .align 3 + +.L121: + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + move YY, Y + .align 3 + +.L222: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + fld.s $f12, X, 0 * SIZE + fld.s $f14, Y, 0 * SIZE + addi.d I, I, -1 + fmadd.s $f14, $f12, $f0, $f14 + fst.s $f14, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From c80e7e27d1b70d6f9cf80bd64bde096d0c3359e3 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:08:03 +0800 Subject: [PATCH 473/718] loongarch64: Add optimizations for sum and asum. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/dasum_lasx.S | 148 +++++++++++++++++++++ kernel/loongarch64/dasum_lsx.S | 158 +++++++++++++++++++++++ kernel/loongarch64/dsum_lasx.S | 125 ++++++++++++++++++ kernel/loongarch64/dsum_lsx.S | 123 ++++++++++++++++++ kernel/loongarch64/sasum_lasx.S | 157 ++++++++++++++++++++++ kernel/loongarch64/sasum_lsx.S | 148 +++++++++++++++++++++ kernel/loongarch64/ssum_lasx.S | 140 ++++++++++++++++++++ kernel/loongarch64/ssum_lsx.S | 125 ++++++++++++++++++ 10 files changed, 1136 insertions(+) create mode 100644 kernel/loongarch64/dasum_lasx.S create mode 100644 kernel/loongarch64/dasum_lsx.S create mode 100644 kernel/loongarch64/dsum_lasx.S create mode 100644 kernel/loongarch64/dsum_lsx.S create mode 100644 kernel/loongarch64/sasum_lasx.S create mode 100644 kernel/loongarch64/sasum_lsx.S create mode 100644 kernel/loongarch64/ssum_lasx.S create mode 100644 kernel/loongarch64/ssum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a94303151..5e83e67a4 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -43,4 +43,10 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S +SSUMKERNEL = ssum_lsx.S +DSUMKERNEL = dsum_lsx.S + +SASUMKERNEL = sasum_lsx.S +DASUMKERNEL = dasum_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4cfd53058..2cc2edb74 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -43,6 +43,12 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S +SSUMKERNEL = ssum_lasx.S +DSUMKERNEL = dsum_lasx.S + +SASUMKERNEL = sasum_lasx.S +DASUMKERNEL = dasum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S new file mode 100644 index 000000000..49de98c40 --- /dev/null +++ b/kernel/loongarch64/dasum_lasx.S @@ -0,0 +1,148 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S new file mode 100644 index 000000000..94750815e --- /dev/null +++ b/kernel/loongarch64/dasum_lsx.S @@ -0,0 +1,158 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fabs.d $f12, $f12 + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S new file mode 100644 index 000000000..3c51dab60 --- /dev/null +++ b/kernel/loongarch64/dsum_lasx.S @@ -0,0 +1,125 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + addi.d I, I, -1 + fadd.d $f16, $f12, $f16 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S new file mode 100644 index 000000000..402d087df --- /dev/null +++ b/kernel/loongarch64/dsum_lsx.S @@ -0,0 +1,123 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.d $f12, X, 0 * SIZE + fadd.d $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.d $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S new file mode 100644 index 000000000..a452701aa --- /dev/null +++ b/kernel/loongarch64/sasum_lasx.S @@ -0,0 +1,157 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S new file mode 100644 index 000000000..87026a144 --- /dev/null +++ b/kernel/loongarch64/sasum_lsx.S @@ -0,0 +1,148 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fabs.s $f12, $f12 + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S new file mode 100644 index 000000000..7cf57bc77 --- /dev/null +++ b/kernel/loongarch64/ssum_lasx.S @@ -0,0 +1,140 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 * SIZE + xvfadd.s res1, VX0, res1 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S new file mode 100644 index 000000000..de63c69e3 --- /dev/null +++ b/kernel/loongarch64/ssum_lsx.S @@ -0,0 +1,125 @@ +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s $f12, X, 0 * SIZE + fadd.s $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From f9b468990e2ca2af5ea3f5848dc23152aa0da5bb Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:12:29 +0800 Subject: [PATCH 474/718] loongarch64: Add optimizations for rot. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/drot_lasx.S | 927 +++++++++++++++++++ kernel/loongarch64/drot_lsx.S | 1050 ++++++++++++++++++++++ kernel/loongarch64/srot_lasx.S | 863 ++++++++++++++++++ kernel/loongarch64/srot_lsx.S | 927 +++++++++++++++++++ 6 files changed, 3773 insertions(+) create mode 100644 kernel/loongarch64/drot_lasx.S create mode 100644 kernel/loongarch64/drot_lsx.S create mode 100644 kernel/loongarch64/srot_lasx.S create mode 100644 kernel/loongarch64/srot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5e83e67a4..026ea0d77 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,4 +49,7 @@ DSUMKERNEL = dsum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S +SROTKERNEL = srot_lsx.S +DROTKERNEL = drot_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 2cc2edb74..4905a50a9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,6 +49,9 @@ DSUMKERNEL = dsum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S +SROTKERNEL = srot_lasx.S +DROTKERNEL = drot_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/drot_lasx.S new file mode 100644 index 000000000..d3644b780 --- /dev/null +++ b/kernel/loongarch64/drot_lasx.S @@ -0,0 +1,927 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, C + xvreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + xvreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 4 * SIZE + xvst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvld VX1, X, 4 * SIZE + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + addi.d I, I, -1 + xvst VT0, X, 4 * SIZE + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, X, 4 * SIZE + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VXS, VX0 + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX1, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXC, VX0 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX2, VXC + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX1, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX3, VXS + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VXS, VX2 + xvfmul.d VT1, VXS, VX0 + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX1, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmadd.d VT0, VX2, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmadd.d VT0, VX3, VXS, VT0 + xvfmul.d VT1, VX0, VXS + xvfmsub.d VT1, VX3, VXC, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX0, VXC + xvfmul.d VT1, VX2, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX1, VXC + xvfmul.d VT1, VX3, VXC + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX2, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvfmul.d VT0, VX3, VXS + xvfmul.d VT1, VX0, VXS + xvfsub.d VT1, VXZ, VT1 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + fmul.d $f10, $f12, C + fmadd.d $f10, $f13, S, $f10 + fst.d $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.d $f20, $f12, S + fmsub.d $f20, $f13, C, $f20 + fst.d $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S new file mode 100644 index 000000000..6db803b1c --- /dev/null +++ b/kernel/loongarch64/drot_lsx.S @@ -0,0 +1,1050 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.d.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VXS, VX0 + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VXS, VX0 + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX1, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXC, VX0 + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VXS, VX2 + vfmul.d VT1, VXS, VX0 + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VXS, VX2 + vfmul.d VT1, VXS, VX0 + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX1, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.d $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.d $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmadd.d VT0, VX2, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmadd.d VT0, VX3, VXS, VT0 + vfmul.d VT1, VX0, VXS + vfmsub.d VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX0, VXC + vfmul.d VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX1, VXC + vfmul.d VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX2, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vfmul.d VT0, VX3, VXS + vfmul.d VT1, VX0, VXS + vfsub.d VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f12, X, 0 * SIZE + fld.d $f13, Y, 0 * SIZE + fmul.d $f10, $f12, C + fmadd.d $f10, $f13, S, $f10 + fst.d $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.d $f20, $f12, S + fmsub.d $f20, $f13, C, $f20 + fst.d $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S new file mode 100644 index 000000000..9aeb4dcf5 --- /dev/null +++ b/kernel/loongarch64/srot_lasx.S @@ -0,0 +1,863 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvst VT1, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + fmul.s $f10, $f12, C + fmadd.s $f10, $f13, S, $f10 + fst.s $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.s $f20, $f12, S + fmsub.s $f20, $f13, C, $f20 + fst.s $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S new file mode 100644 index 000000000..8822b58e4 --- /dev/null +++ b/kernel/loongarch64/srot_lsx.S @@ -0,0 +1,927 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + ffint.s.l a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L110 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L120 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L124 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L210 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXC, VX0 + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VXS, VX0 + vfmsub.s VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX1, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXC, VX0 + vfmul.s VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VXS, VX2 + vfmul.s VT1, VXS, VX0 + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX1, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + fcmp.ceq.s $fcc0, C, a1 + bcnez $fcc0, .L220 + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + fcmp.ceq.s $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmadd.s VT0, VX2, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmadd.s VT0, VX3, VXS, VT0 + vfmul.s VT1, VX0, VXS + vfmsub.s VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX0, VXC + vfmul.s VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX1, VXC + vfmul.s VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX2, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VT0, VX3, VXS + vfmul.s VT1, VX0, VXS + vfsub.s VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f12, X, 0 * SIZE + fld.s $f13, Y, 0 * SIZE + fmul.s $f10, $f12, C + fmadd.s $f10, $f13, S, $f10 + fst.s $f10, X, 0 * SIZE + addi.d I, I, -1 + fmul.s $f20, $f12, S + fmsub.s $f20, $f13, C, $f20 + fst.s $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE \ No newline at end of file From d32f38fb37c94afae20a662182fe7178c005a673 Mon Sep 17 00:00:00 2001 From: yancheng Date: Thu, 7 Dec 2023 13:15:55 +0800 Subject: [PATCH 475/718] loongarch64: Add optimizations for nrm2. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/dnrm2_lasx.S | 233 ++++++++++++++++++++++ kernel/loongarch64/dnrm2_lsx.S | 242 +++++++++++++++++++++++ kernel/loongarch64/snrm2_lasx.S | 143 ++++++++++++++ kernel/loongarch64/snrm2_lsx.S | 156 +++++++++++++++ 6 files changed, 780 insertions(+) create mode 100644 kernel/loongarch64/dnrm2_lasx.S create mode 100644 kernel/loongarch64/dnrm2_lsx.S create mode 100644 kernel/loongarch64/snrm2_lasx.S create mode 100644 kernel/loongarch64/snrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 026ea0d77..1e4fa7a9d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -52,4 +52,7 @@ DASUMKERNEL = dasum_lsx.S SROTKERNEL = srot_lsx.S DROTKERNEL = drot_lsx.S +SNRM2KERNEL = snrm2_lsx.S +DNRM2KERNEL = dnrm2_lsx.S + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 4905a50a9..f00abcb32 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -52,6 +52,9 @@ DASUMKERNEL = dasum_lasx.S SROTKERNEL = srot_lasx.S DROTKERNEL = drot_lasx.S +SNRM2KERNEL = snrm2_lasx.S +DNRM2KERNEL = dnrm2_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S new file mode 100644 index 000000000..2a9c3cf7b --- /dev/null +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -0,0 +1,233 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + xvld VM0, X, 0 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + move TEMP, X // initialize the maxa value + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L97 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t3, 2 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t3, 2 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + xvld VX1, X, 0 + xvfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM0, VX0, VALPHA + xvfmul.d VM1, VX1, VALPHA + xvfmadd.d res1, VM0, VM0, res1 + xvfmadd.d res2, VM1, VM1, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM0, VX0, VALPHA + xvfmul.d VM1, VX1, VALPHA + xvfmadd.d res1, VM0, VM0, res1 + xvfmadd.d res2, VM1, VM1, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM0, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM0 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + fmov.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S new file mode 100644 index 000000000..e4615e18d --- /dev/null +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -0,0 +1,242 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + vld VM0, X, 0 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + move TEMP, X // initialize the maxa value + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L97 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 7 + bge $r0, I, .L99 + .align 3 + +.L98: + vld VX1, X, 0 + vfmaxa.d VM0, VM0, VX1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, SIZE + bne INCX, TEMP, .L120 + srai.d I, N, 3 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + srai.d I, N, 3 + bge $r0, I, .L997 + +.L121: + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM0, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + add.d XX, XX, INCX + vfmul.d VM1, VX1, VALPHA + ld.d t2, XX, 0 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + ld.d t3, XX, 0 * SIZE + add.d XX, XX, INCX + ld.d t4, XX, 0 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM0, VX0, VALPHA + vfmul.d VM1, VX1, VALPHA + vfmadd.d res1, VM0, VM0, res1 + vfmadd.d res2, VM1, VM1, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d $f15, XX, 0 * SIZE + addi.d I, I, -1 + fmul.d $f15, $f15, ALPHA + fmadd.d $f19, $f15, $f15, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + fmov.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S new file mode 100644 index 000000000..274908c14 --- /dev/null +++ b/kernel/loongarch64/snrm2_lasx.S @@ -0,0 +1,143 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 0 * SIZE + xvfcvtl.d.s VX0, VX0 + xvfcvth.d.s VX1, VX1 + xvfmadd.d res1, VX0, VX0, res1 + xvfmadd.d res2, VX1, VX1, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfcvtl.d.s VX0, VX0 + xvfcvth.d.s VX1, VX1 + xvfmadd.d res1, VX0, VX0, res1 + xvfmadd.d res2, VX1, VX1, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S new file mode 100644 index 000000000..17d017900 --- /dev/null +++ b/kernel/loongarch64/snrm2_lsx.S @@ -0,0 +1,156 @@ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 0 * SIZE + vfcvtl.d.s VX0, VX0 + vfcvth.d.s VX1, VX1 + vfmadd.d res1, VX0, VX0, res1 + vfmadd.d res2, VX1, VX1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 4 * SIZE + vfcvtl.d.s VX2, VX2 + vfcvth.d.s VX3, VX3 + vfmadd.d res1, VX2, VX2, res1 + vfmadd.d res2, VX3, VX3, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfcvtl.d.s VX0, VX0 + vfcvth.d.s VX1, VX1 + vfmadd.d res1, VX0, VX0, res1 + vfmadd.d res2, VX1, VX1, res2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfcvtl.d.s VX2, VX2 + vfcvth.d.s VX3, VX3 + vfmadd.d res1, VX2, VX2, res1 + vfmadd.d res2, VX3, VX3, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s $f15, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s $f15, $f15 + fmadd.d $f19, $f15, $f15, $f19 + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From edac80d7e8ba97e39002f223628a956456356fa9 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Thu, 7 Dec 2023 14:59:27 -0800 Subject: [PATCH 476/718] some cleanup, dynamically scale threads, add missing WIN_CASE defn --- driver/others/blas_server_win32.c | 70 ++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5af1f1a51..40ff85abc 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -51,15 +51,6 @@ /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ -//typedef struct{ -// CRITICAL_SECTION lock; -// HANDLE filled; -// HANDLE killed; -// -// blas_queue_t *queue; /* Parameter Pointer */ -// int shutdown; /* server shutdown flag */ -// -//} blas_pool_t; static blas_queue_t *work_queue = NULL; static HANDLE kickoff_event = NULL; @@ -71,11 +62,19 @@ int blas_server_avail = 0; /* Local Variables */ static BLASULONG server_lock = 0; -//static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +static volatile int thread_target; // target num of live threads, volatile for cross-thread reads - +#if defined (__GNUC__) && (__GNUC__ < 6) + #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) +#else + #if defined(_WIN64) + #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) + #else + #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) + #endif +#endif static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ @@ -206,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ static DWORD WINAPI blas_thread_server(void *arg){ /* Thread identifier */ -#ifdef SMP_DEBUG BLASLONG cpu = (BLASLONG)arg; -#endif void *buffer, *sa, *sb; blas_queue_t *queue; - DWORD action; - //HANDLE handles[] = {pool.filled, pool.killed}; /* Each server needs each buffer */ buffer = blas_memory_alloc(2); @@ -232,6 +227,12 @@ static DWORD WINAPI blas_thread_server(void *arg){ // event raised when work is added to the queue WaitForSingleObject(kickoff_event, INFINITE); + if (cpu > thread_target - 2) + { + //printf("thread [%d] exiting.\n", cpu); + break; // excess thread, so worker thread exits + } + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif @@ -245,17 +246,17 @@ static DWORD WINAPI blas_thread_server(void *arg){ LeaveCriticalSection(&queue_lock); #else - volatile work_queue_t* queue_next; + volatile blas_queue_t* queue_next; INT_PTR prev_value; do { - queue = (volatile work_queue_t*)work_queue; + queue = (volatile blas_queue_t*)work_queue; if (!queue) break; - queue_next = (volatile work_queue_t*)queue->next; + queue_next = (volatile blas_queue_t*)queue->next; prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); - } while (prev_value != work_item); + } while (prev_value != queue); #endif if (queue) { @@ -377,9 +378,13 @@ int blas_thread_init(void){ // create the kickoff Event kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); + thread_target = blas_cpu_number; + InitializeCriticalSection(&queue_lock); for(i = 0; i < blas_cpu_number - 1; i++){ + //printf("thread_init: creating thread [%d]\n", i); + blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); @@ -564,10 +569,36 @@ void goto_set_num_threads(int num_threads) if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + if (blas_server_avail && num_threads < blas_num_threads) { + LOCK_COMMAND(&server_lock); + + thread_target = num_threads; + + SetEvent(kickoff_event); + + for (i = num_threads - 1; i < blas_num_threads - 1; i++) { + //printf("set_num_threads: waiting on thread [%d] to quit.\n", i); + + WaitForSingleObject(blas_threads[i], INFINITE); + + //printf("set_num_threads: thread [%d] has quit.\n", i); + + CloseHandle(blas_threads[i]); + } + + blas_num_threads = num_threads; + + ResetEvent(kickoff_event); + + UNLOCK_COMMAND(&server_lock); + } + if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); + thread_target = num_threads; + //increased_threads = 1; if (!blas_server_avail){ // create the kickoff Event @@ -579,6 +610,7 @@ void goto_set_num_threads(int num_threads) } for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ + //printf("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From 4e738e561a95707e5ae41465af4896b74b2ca138 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 8 Dec 2023 12:36:08 -0600 Subject: [PATCH 477/718] Replace two vector loads with one vector pair load and fix endianess of stores. --- kernel/power/sgemm_tcopy_16_power8.S | 3 + kernel/power/sgemm_tcopy_macros_16_power10.S | 323 +++++++++++++++++++ kernel/power/sgemm_tcopy_macros_16_power8.S | 6 + 3 files changed, 332 insertions(+) create mode 100644 kernel/power/sgemm_tcopy_macros_16_power10.S diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index b9f6d63fb..6d2c6a555 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "sgemm_tcopy_macros_16_power10.S" +#endif #include "sgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..dca37e48a --- /dev/null +++ b/kernel/power/sgemm_tcopy_macros_16_power10.S @@ -0,0 +1,323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs40, o0, T1 + stxvx vs41, o16, T1 + stxvx vs42, o32, T1 + stxvx vs43, o48, T1 +#else + stxvx vs41, o0, T1 + stxvx vs40, o16, T1 + stxvx vs43, o32, T1 + stxvx vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs44, o0, T1 + stxvx vs45, o16, T1 + stxvx vs46, o32, T1 + stxvx vs47, o48, T1 +#else + stxvx vs45, o0, T1 + stxvx vs44, o16, T1 + stxvx vs47, o32, T1 + stxvx vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + lxvpx vs36, o0, A2 + + lxvpx vs38, o0, A3 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs36, o0, T1 + stxvx vs37, o16, T1 + stxvx vs38, o32, T1 + stxvx vs39, o48, T1 +#else + stxvx vs37, o0, T1 + stxvx vs36, o16, T1 + stxvx vs39, o32, T1 + stxvx vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + + lxvpx vs34, o0, A1 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 + stxvx vs34, o32, T1 + stxvx vs35, o48, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 + stxvx vs35, o32, T1 + stxvx vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvx vs32, o0, T1 + stxvx vs33, o16, T1 +#else + stxvx vs33, o0, T1 + stxvx vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index ed592a604..af237d5ee 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -141,6 +142,7 @@ define(`COPY_4x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=4 and M=4 @@ -264,6 +266,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -329,6 +332,7 @@ define(`COPY_2x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=2 and M=4 @@ -418,6 +422,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -465,6 +470,7 @@ define(`COPY_1x8', ` #else .endm #endif +#endif /********************************************************************************************** * Macros for N=1 and M=4 From c732f275a27cbd9044d8409c2dd13e1e32f675ca Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Mon, 11 Dec 2023 21:05:31 +0800 Subject: [PATCH 478/718] system_check.cmake: fix arch detection for Darwin PowerPC --- cmake/system_check.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c9671b379..e94497a04 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64) endif() elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*")) set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) @@ -109,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) + if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc"))) set(BINARY 64) else () set(BINARY 32) From 9dbc8129b3987fd038585904e612a8ff0f62f947 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Mon, 11 Dec 2023 21:09:06 +0800 Subject: [PATCH 479/718] cpuid_power.c: add CPU_SUBTYPE_POWERPC_7400 case --- cpuid_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_power.c b/cpuid_power.c index 2526e8d0e..1ced8930a 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -160,6 +160,7 @@ int detect(void){ infoCount = HOST_BASIC_INFO_COUNT; host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; From 330101e0b38b95337818a27b29300f1637c1cce5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Dec 2023 21:52:00 +0100 Subject: [PATCH 480/718] Add complex type definitions for MSVC --- lapack-netlib/LAPACKE/include/lapack.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 28f8ad655..dac6b22be 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -37,7 +37,15 @@ */ #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex +#endif +#else /* Complex type (single precision) */ #ifndef lapack_complex_float #ifndef __cplusplus @@ -74,6 +82,7 @@ #define lapack_complex_double_imag(z) (cimag(z)) #endif +#endif #endif /* LAPACK_COMPLEX_CUSTOM */ From 6bd7c54af5ecc2004b8a6df0157fe72d55530927 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 11 Dec 2023 15:13:04 -0800 Subject: [PATCH 481/718] introduce MT_TRACE to clean up SMP_DEBUG code --- driver/others/blas_server_win32.c | 60 ++++++++++++------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 40ff85abc..5820a55f4 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -48,6 +48,12 @@ #endif #endif +#ifdef SMP_DEBUG +# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) +#else +# define MT_TRACE(...) +#endif + /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ @@ -213,29 +219,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Each server needs each buffer */ buffer = blas_memory_alloc(2); -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Thread is started!\n", cpu); while (1){ /* Waiting for Queue */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); + // event raised when work is added to the queue WaitForSingleObject(kickoff_event, INFINITE); if (cpu > thread_target - 2) { - //printf("thread [%d] exiting.\n", cpu); + //MT_TRACE("thread [%d] exiting.\n", cpu); break; // excess thread, so worker thread exits } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); -#endif + MT_TRACE("Server[%2ld] Got it.\n", cpu); #if 1 EnterCriticalSection(&queue_lock); @@ -270,10 +271,8 @@ static DWORD WINAPI blas_thread_server(void *arg){ __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); -#endif // fprintf(stderr, "queue start[%ld]!!!\n", cpu); @@ -342,19 +341,14 @@ static DWORD WINAPI blas_thread_server(void *arg){ continue; //if queue == NULL } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Finished!\n", cpu); queue->finished = 1; - } /* Shutdown procedure */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Shutdown!\n", cpu); blas_memory_free(buffer); @@ -369,10 +363,7 @@ int blas_thread_init(void){ LOCK_COMMAND(&server_lock); -#ifdef SMP_DEBUG - fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", - blas_cpu_number); -#endif + MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); if (!blas_server_avail){ // create the kickoff Event @@ -383,7 +374,7 @@ int blas_thread_init(void){ InitializeCriticalSection(&queue_lock); for(i = 0; i < blas_cpu_number - 1; i++){ - //printf("thread_init: creating thread [%d]\n", i); + //MT_TRACE("thread_init: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -458,14 +449,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Synchronization Waiting.\n"); -#endif + MT_TRACE("Synchronization Waiting.\n"); while (num){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Waiting Queue ..\n"); -#endif + MT_TRACE("Waiting Queue ..\n"); while (!queue->finished) YIELDING; @@ -473,9 +460,8 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ num--; } -#ifdef SMP_DEBUG - fprintf(STDERR, "Completely Done.\n\n"); -#endif + MT_TRACE("Completely Done.\n\n"); + // if work was added to the queue after this batch we can't sleep the worker threads // by resetting the event EnterCriticalSection(&queue_lock); @@ -577,11 +563,11 @@ void goto_set_num_threads(int num_threads) SetEvent(kickoff_event); for (i = num_threads - 1; i < blas_num_threads - 1; i++) { - //printf("set_num_threads: waiting on thread [%d] to quit.\n", i); + //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); WaitForSingleObject(blas_threads[i], INFINITE); - //printf("set_num_threads: thread [%d] has quit.\n", i); + //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); CloseHandle(blas_threads[i]); } @@ -610,7 +596,7 @@ void goto_set_num_threads(int num_threads) } for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ - //printf("set_num_threads: creating thread [%d]\n", i); + //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, From dcf6999c4e3b6af70d2822b9c2629e2df91378e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 11:27:17 +0100 Subject: [PATCH 482/718] remove extraneous endif --- lapack-netlib/LAPACKE/include/lapack.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index dac6b22be..b9f81792d 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -43,7 +43,6 @@ #define LAPACK_COMPLEX_CUSTOM #define lapack_complex_float _Fcomplex #define lapack_complex_double _Dcomplex -#endif #else /* Complex type (single precision) */ From dcdc35127276b4a261292bf03ca8570e89ee105a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 23:06:22 +0100 Subject: [PATCH 483/718] Add MSVC-compatible complex types --- lapack-netlib/LAPACKE/include/lapacke_config.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index c64fc4416..798a5eb2e 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -68,6 +68,17 @@ extern "C" { #endif #ifndef LAPACK_COMPLEX_CUSTOM +#if defined(_MSC_VER) + #define _CRT_USE_C_COMPLEX_H + #include + #define LAPACK_COMPLEX_CUSTOM + #define lapack_complex_float _Fcomplex + #define lapack_complex_double _Dcomplex + #define lapack_complex_float_real(z) (creal(z)) + #define lapack_complex_float_imag(z) (cimag(z)) + #define lapack_complex_double_real(z) (creal(z)) + #define lapack_complex_double_imag(z) (cimag(z)) +#else #if defined(LAPACK_COMPLEX_STRUCTURE) @@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double; #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #endif lapack_complex_float lapack_make_complex_float( float re, float im ); From aa46f1e4e721550a13524ee890c0d26077c0a9eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Dec 2023 23:07:48 +0100 Subject: [PATCH 484/718] revert addition of MSVC-compatible complex (moved to lapacke_config.h) --- lapack-netlib/LAPACKE/include/lapack.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index b9f81792d..28f8ad655 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -37,14 +37,7 @@ */ #ifndef LAPACK_COMPLEX_CUSTOM -#if defined(_MSC_VER) - #define _CRT_USE_C_COMPLEX_H - #include - #define LAPACK_COMPLEX_CUSTOM - #define lapack_complex_float _Fcomplex - #define lapack_complex_double _Dcomplex -#else /* Complex type (single precision) */ #ifndef lapack_complex_float #ifndef __cplusplus @@ -81,7 +74,6 @@ #define lapack_complex_double_imag(z) (cimag(z)) #endif -#endif #endif /* LAPACK_COMPLEX_CUSTOM */ From a8d3619f65e65cb5a7e24f148a4924339a1a702f Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Wed, 13 Dec 2023 19:42:56 +0800 Subject: [PATCH 485/718] cc.cmake: add optflags for G5 and G4 kernels --- cmake/cc.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 7b4ef8947..ac8661a7b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -282,6 +282,18 @@ if (${CORE} STREQUAL POWER8) endif () endif () +if (${CORE} STREQUAL PPC970) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") + endif () +endif () + +if (${CORE} STREQUAL PPCG4) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") From d9653af01851f52470f7b65d8650f6f2f5431b6e Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Wed, 13 Dec 2023 19:23:50 +0800 Subject: [PATCH 486/718] KERNEL.PPC970, KERNEL.PPCG4: unbreak CMake parsing Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4366 --- kernel/power/KERNEL.PPC970 | 8 ++++---- kernel/power/KERNEL.PPCG4 | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index a99fb7d96..fee5fa529 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -1,11 +1,11 @@ ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) else diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 1bdd3119e..c73601cee 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -96,9 +96,9 @@ CGEMMINCOPY = CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = +CGEMMINCOPYOBJ = #cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = #cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) From 981e315b30bb50cbb2b7375665f0f11d0d567703 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Thu, 14 Dec 2023 12:01:31 +0800 Subject: [PATCH 487/718] cc.cmake: use -force_cpusubtype_ALL for Darwin PPC --- cmake/cc.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index ac8661a7b..d5bf3b7ee 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -286,12 +286,18 @@ if (${CORE} STREQUAL PPC970) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () endif () if (${CORE} STREQUAL PPCG4) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") endif () + if (APPLE) + set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") + endif () endif () if (NOT DYNAMIC_ARCH) From f06b5355667434f29258c3664bc5ea517988040b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Dec 2023 09:58:44 +0100 Subject: [PATCH 488/718] Use C kernel for dgemv_t due to limitations of the old assembly one --- kernel/x86_64/KERNEL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index bea7036c2..f8278c3b4 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S endif ifndef DGEMVTKERNEL -DGEMVTKERNEL = dgemv_t.S +DGEMVTKERNEL = dgemv_t_4.c endif ifndef CGEMVNKERNEL From 544cb8630049650cec9bf4c582fda43f334d5c59 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Dec 2023 14:03:59 +0100 Subject: [PATCH 489/718] Mention C906V instruction set limitation and update DYNAMIC_ARCH lists --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index aaadd0d9c..b8d66ed42 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` - (also known to work on C906) + (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) ### Support for multiple targets in a single library @@ -207,9 +207,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. -On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. +On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled. -For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. +For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build. + +on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. From 8c143331b0216809343d056b084e29a0ab3311a7 Mon Sep 17 00:00:00 2001 From: barracuda156 Date: Fri, 15 Dec 2023 22:55:52 +0800 Subject: [PATCH 490/718] PPC970: drop -mcpu=970 which seems to produce faulty code Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4376 --- cmake/cc.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5bf3b7ee..00952e810 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -282,18 +282,21 @@ if (${CORE} STREQUAL POWER8) endif () endif () +# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone +# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case. if (${CORE} STREQUAL PPC970) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math") + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math") endif () if (APPLE) set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") endif () endif () +# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency? if (${CORE} STREQUAL PPCG4) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math") + set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math") endif () if (APPLE) set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL") From dab0da8243ce03d2a8d76e614259c099bf87b819 Mon Sep 17 00:00:00 2001 From: Darshan Patel Date: Tue, 19 Dec 2023 13:56:55 +0530 Subject: [PATCH 491/718] Update GEMM param for NEOVERSEV1 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index ee4640f57..1ec2d16dd 100644 --- a/param.h +++ b/param.h @@ -3396,13 +3396,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_MN 16 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 From 7a4fef4f604db2a5e4e0c4ffaebea220a0646ab1 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 15 Dec 2023 12:50:48 +0000 Subject: [PATCH 492/718] Tweak SVE dot kernel This changes the SVE dot kernel to only predicate when necessary as well as streamlining the assembly a bit. The benchmarks seem to indicate this can improve performance by ~33%. --- kernel/arm64/dot_kernel_sve.c | 100 +++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 9c057551e..133de4ab3 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -1,4 +1,5 @@ /*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() +#define DTYPE "d" +#define WIDTH "d" +#define SHIFT "3" #else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() +#define DTYPE "s" +#define WIDTH "w" +#define SHIFT "2" #endif -static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { - SVE_TYPE acc_a = SVE_ZERO; - SVE_TYPE acc_b = SVE_ZERO; +#define COUNT \ +" cnt"WIDTH" x9 \n" +#define SETUP_TRUE \ +" ptrue p0."DTYPE" \n" +#define OFFSET_INPUTS \ +" add x12, %[X_], x9, lsl #"SHIFT" \n" \ +" add x13, %[Y_], x9, lsl #"SHIFT" \n" +#define TAIL_WHILE \ +" whilelo p1."DTYPE", x8, x0 \n" +#define UPDATE(pg, x,y,out) \ +" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ +" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ +" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" +#define SUM_VECTOR(v) \ +" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" +#define RET \ +" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" - BLASLONG sve_width = SVE_WIDTH; +#define DOT_KERNEL \ + COUNT \ +" mov z1.d, #0 \n" \ +" mov z0.d, #0 \n" \ +" mov x8, #0 \n" \ +" movi d1, #0x0 \n" \ + SETUP_TRUE \ +" neg x10, x9, lsl #1 \n" \ +" ands x11, x10, x0 \n" \ +" b.eq .Lskip_2x \n" \ + OFFSET_INPUTS \ +".Lvector_2x: \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ + UPDATE("p0", "x12", "x13", "z0") \ +" sub x8, x8, x10 \n" \ +" cmp x8, x11 \n" \ +" b.lo .Lvector_2x \n" \ + SUM_VECTOR("1") \ +".Lskip_2x: \n" \ +" neg x10, x9 \n" \ +" and x10, x10, x0 \n" \ +" cmp x8, x10 \n" \ +" b.hs .Ltail \n" \ +".Lvector_1x: \n" \ + UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ +" add x8, x8, x9 \n" \ +" cmp x8, x10 \n" \ +" b.lo .Lvector_1x \n" \ +".Ltail: \n" \ +" cmp x10, x0 \n" \ +" b.eq .Lend \n" \ + TAIL_WHILE \ + UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ +".Lend: \n" \ + SUM_VECTOR("0") \ + RET - for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); - svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); +static +FLOAT +dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) +{ + FLOAT ret; - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + asm(DOT_KERNEL + : + [RET_] "=&w" (ret) + : + [N_] "r" (n), + [X_] "r" (x), + [Y_] "r" (y) + :); - acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); - acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); - } - - return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); + return ret; } From 60e66725e40ef5c6b5a8b2214c3b3a633f699c9b Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 19 Dec 2023 13:11:06 +0000 Subject: [PATCH 493/718] Use numeric labels to allow repeated inlining --- kernel/arm64/dot_kernel_sve.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 133de4ab3..16f4cd537 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -67,31 +67,31 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SETUP_TRUE \ " neg x10, x9, lsl #1 \n" \ " ands x11, x10, x0 \n" \ -" b.eq .Lskip_2x \n" \ +" b.eq 2f // skip_2x \n" \ OFFSET_INPUTS \ -".Lvector_2x: \n" \ +"1: // vector_2x \n" \ UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ UPDATE("p0", "x12", "x13", "z0") \ " sub x8, x8, x10 \n" \ " cmp x8, x11 \n" \ -" b.lo .Lvector_2x \n" \ +" b.lo 1b // vector_2x \n" \ SUM_VECTOR("1") \ -".Lskip_2x: \n" \ +"2: // skip_2x \n" \ " neg x10, x9 \n" \ " and x10, x10, x0 \n" \ " cmp x8, x10 \n" \ -" b.hs .Ltail \n" \ -".Lvector_1x: \n" \ +" b.hs 4f // tail \n" \ +"3: // vector_1x \n" \ UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ " add x8, x8, x9 \n" \ " cmp x8, x10 \n" \ -" b.lo .Lvector_1x \n" \ -".Ltail: \n" \ +" b.lo 3b // vector_1x \n" \ +"4: // tail \n" \ " cmp x10, x0 \n" \ -" b.eq .Lend \n" \ +" b.eq 5f // end \n" \ TAIL_WHILE \ UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ -".Lend: \n" \ +"5: // end \n" \ SUM_VECTOR("0") \ RET From bb8b91e9f2c8176e7a6388fe7a949744771bb29d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Dec 2023 23:13:02 +0100 Subject: [PATCH 494/718] restore OpenBLAS-specific test paths --- lapack-netlib/lapack_testing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py index 96fbeb2a6..fea0ff765 100755 --- a/lapack-netlib/lapack_testing.py +++ b/lapack-netlib/lapack_testing.py @@ -255,19 +255,19 @@ for dtype in range_prec: else: if dtest==16: # LIN TESTS - cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==17: # PROTO LIN TESTS - cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==18: # PROTO LIN TESTS - cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" elif dtest==20: # DMD EIG TESTS - cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" else: # EIG TESTS - cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" + cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out" if not just_errors and not short_summary: print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ') # Run the process: either to read the file or run the LAPACK testing From e67a0eaaf9f02e37270ac03747b333b28dac61e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Dec 2023 23:15:11 +0100 Subject: [PATCH 495/718] Restore OpenBLAS-specific build rule changes --- lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index 4e7cf4629..9cf0fc95e 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -136,28 +136,28 @@ double: xeigtstd xdmdeigtstd complex16: xeigtstz xdmdeigtstz xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(SDMDEIGTST): $(FRC) $(CDMDEIGTST): $(FRC) From a7ed60bfe93a73df853be560244aa33a774088ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 21 Dec 2023 20:05:23 +0100 Subject: [PATCH 496/718] Add lower limit for multithreading --- interface/lapack/gesv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 175350329..546c2bed2 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From ecae1389dff8bcb6643685aedb1fbaeb9b26d79d Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Sat, 23 Dec 2023 12:21:48 +0000 Subject: [PATCH 497/718] Reduce duplication in kernel definitions These files are exactly the same, so I believe we can reduce these files down. Other files require a slightly more complex unpicking. --- kernel/arm64/KERNEL.A64FX | 207 +----------------------------- kernel/arm64/KERNEL.CORTEXA55 | 197 +--------------------------- kernel/arm64/KERNEL.THUNDERX3T110 | 185 +------------------------- 3 files changed, 3 insertions(+), 586 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ccbce27e1..bc5999097 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,206 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = trsm_kernel_LN_sve.c -STRSMKERNEL_LT = trsm_kernel_LT_sve.c -STRSMKERNEL_RN = trsm_kernel_RN_sve.c -STRSMKERNEL_RT = trsm_kernel_RT_sve.c - -DTRSMKERNEL_LN = trsm_kernel_LN_sve.c -DTRSMKERNEL_LT = trsm_kernel_LT_sve.c -DTRSMKERNEL_RN = trsm_kernel_RN_sve.c -DTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -TRSMCOPYLN_M = trsm_lncopy_sve.c -TRSMCOPYLT_M = trsm_ltcopy_sve.c -TRSMCOPYUN_M = trsm_uncopy_sve.c -TRSMCOPYUT_M = trsm_utcopy_sve.c - -CTRSMKERNEL_LN = trsm_kernel_LN_sve.c -CTRSMKERNEL_LT = trsm_kernel_LT_sve.c -CTRSMKERNEL_RN = trsm_kernel_RN_sve.c -CTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c -ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c -ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c -ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c - -ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c -ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c -ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c -ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c - - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S - -SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S - -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRMMUNCOPY_M = trmm_uncopy_sve_v1.c -STRMMLNCOPY_M = trmm_lncopy_sve_v1.c -STRMMUTCOPY_M = trmm_utcopy_sve_v1.c -STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -SSYMMUCOPY_M = symm_ucopy_sve.c -SSYMMLCOPY_M = symm_lcopy_sve.c - -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S - -DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c - -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c - -CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -CHEMMLTCOPY_M = zhemm_ltcopy_sve.c -CHEMMUTCOPY_M = zhemm_utcopy_sve.c - -CSYMMUCOPY_M = zsymm_ucopy_sve.c -CSYMMLCOPY_M = zsymm_lcopy_sve.c - -ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S - -ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c - -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c -ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c -ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c -ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c - -ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c -ZHEMMUTCOPY_M = zhemm_utcopy_sve.c - -ZSYMMUCOPY_M = zsymm_ucopy_sve.c -ZSYMMLCOPY_M = zsymm_lcopy_sve.c +include $(KERNELDIR)/KERNEL.ARMV8SVE diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index e2e006770..574e98b8c 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -1,196 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +include $(KERNELDIR)/KERNEL.CORTEXA53 diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index 41cedc851..5d3bd69f7 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -1,184 +1 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = daxpy_thunderx2t99.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c -DSDOTKERNEL = dot.S - -ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) -DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) -SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S -endif - -ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) -CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S -endif - -ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) -ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S -endif +include $(KERNELDIR)/KERNEL.THUNDERX2T99 From dc20a7818899cd6fbcf192b93e1940b353f83a0f Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Sat, 23 Dec 2023 12:19:33 +0000 Subject: [PATCH 498/718] Use functionally equivalent dynamic targets Similar to `drivers/other/dynamic.c`, I've looked for functionally equivalent targets and mapped them in the default DYNAMIC_ARCH build. Users can still build specific cores using DYNAMIC_LIST. --- Makefile.system | 4 ---- driver/others/dynamic_arm64.c | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/Makefile.system b/Makefile.system index ff06e503c..e602eaf05 100644 --- a/Makefile.system +++ b/Makefile.system @@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64) DYNAMIC_CORE = ARMV8 DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 -DYNAMIC_CORE += CORTEXA72 -DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE endif -DYNAMIC_CORE += CORTEXA55 -DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 530d18115..b5fb8161d 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55; #endif #else extern gotoblas_t gotoblas_CORTEXA53; +#define gotoblas_CORTEXA55 gotoblas_CORTEXA53 extern gotoblas_t gotoblas_CORTEXA57; -extern gotoblas_t gotoblas_CORTEXA72; -extern gotoblas_t gotoblas_CORTEXA73; -extern gotoblas_t gotoblas_FALKOR; +#define gotoblas_CORTEXA72 gotoblas_CORTEXA57 +#define gotoblas_CORTEXA73 gotoblas_CORTEXA57 +#define gotoblas_FALKOR gotoblas_CORTEXA57 extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; @@ -141,7 +142,6 @@ extern gotoblas_t gotoblas_ARMV8SVE; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; -extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); From 8ce44c18a0e81527f7e9db98ee6276af4a408b8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:24:10 +0100 Subject: [PATCH 499/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/TESTING/EIG/cerrst.f | 12 ++++++------ lapack-netlib/TESTING/EIG/chkxer.f | 2 +- lapack-netlib/TESTING/EIG/derrst.f | 4 ++-- lapack-netlib/TESTING/EIG/serrst.f | 4 ++-- lapack-netlib/TESTING/EIG/zerrst.f | 12 ++++++------ 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index 1748a2aad..d23eb14ea 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0, 0.0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f index fd00bb65a..70caf7e0a 100644 --- a/lapack-netlib/TESTING/EIG/chkxer.f +++ b/lapack-netlib/TESTING/EIG/chkxer.f @@ -61,7 +61,7 @@ RETURN * 9999 FORMAT( ' *** Illegal value of parameter number ', I2, - $ ' not detected by ', A6, ' ***' ) + $ ' not detected by ', A, ' ***' ) * * End of CHKXER * diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index 059538644..7d111e2e0 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -735,12 +735,12 @@ CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index b87fc42ef..408346382 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -733,12 +733,12 @@ CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N, + $ M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0, - $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0, $ INFO ) CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK ) NT = NT + 11 diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index d7b41c053..31881c4de 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -748,17 +748,17 @@ CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 18 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK ) NT = NT + 12 @@ -830,19 +830,19 @@ INFOT = 18 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ), + $ M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 20 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ), + $ M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ), $ 10*N, INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) INFOT = 22 CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1, $ 0.0D0, 0.0D0, 1, 1, 0.0D0, - $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1, + $ M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0, $ INFO ) CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK ) NT = NT + 13 From 5c11b2ff41607ef923593ff73f0317aa93be30a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:27:20 +0100 Subject: [PATCH 500/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f | 7 +++---- lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f | 5 ++--- lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/schksy_aa_2stage.f | 5 ++--- lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f | 4 ++-- lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f | 11 ++++++----- lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f | 4 ++-- 8 files changed, 21 insertions(+), 23 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f index 30a61261f..d79978e55 100644 --- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f @@ -433,9 +433,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'CHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) + CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -517,7 +517,6 @@ c NT = 1 CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'CHETRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 51cef512d..83e8a17b0 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using CHESV_AA. * SRNAMT = 'CHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX) CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f index bc4e77a5a..1940351a4 100644 --- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f @@ -421,9 +421,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'DSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -503,7 +503,6 @@ c NT = 1 CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'DSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f index 91c9e8e9a..d04106ae3 100644 --- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using DSYSV_AA. * SRNAMT = 'DSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f index d3c27ae56..6490cd7c3 100644 --- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f @@ -423,9 +423,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'SSYTRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) @@ -505,7 +505,6 @@ CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA ) * SRNAMT = 'SSYTRS_AA_2STAGE' - LWORK = MAX( 1, 3*N-2 ) CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, $ AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ), $ X, LDA, INFO ) diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f index aff32bce9..319b90805 100644 --- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using SSYSV_AA. * SRNAMT = 'SSYSV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f index 381fac9f2..51082f1d0 100644 --- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f @@ -8,7 +8,7 @@ * Definition: * =========== * -* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, +* SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, * NNS, NSVAL, THRESH, TSTERR, NMAX, A, * AFAC, AINV, B, X, XACT, WORK, RWORK, * IWORK, NOUT ) @@ -185,7 +185,8 @@ LOGICAL DOTYPE( * ) INTEGER IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * ) COMPLEX*16 A( * ), AFAC( * ), AINV( * ), B( * ), - $ RWORK( * ), WORK( * ), X( * ), XACT( * ) + $ WORK( * ), X( * ), XACT( * ) + DOUBLE PRECISION RWORK( * ) * .. * * ===================================================================== @@ -430,9 +431,9 @@ * block factorization, LWORK is the length of AINV. * SRNAMT = 'ZHETRF_AA_2STAGE' - LWORK = MIN(N*NB, 3*NMAX*NMAX) - CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, - $ AINV, (3*NB+1)*N, + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) + CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f index 9401867e0..fcd774491 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f @@ -400,9 +400,9 @@ * Factor the matrix and solve the system using ZHESV_AA. * SRNAMT = 'ZHESV_AA_2STAGE ' - LWORK = MIN(N*NB, 3*NMAX*NMAX) + LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX ) CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA, - $ AINV, (3*NB+1)*N, + $ AINV, MAX( 1, (3*NB+1)*N ), $ IWORK, IWORK( 1+N ), $ X, LDA, WORK, LWORK, INFO ) * From 0814491d968da09eebee620fe573750a4b703944 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:37:03 +0100 Subject: [PATCH 501/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/cgebrd.f | 32 ++++++---- lapack-netlib/SRC/cgehrd.f | 20 +++--- lapack-netlib/SRC/cgelq.f | 8 +-- lapack-netlib/SRC/cgelqf.f | 20 +++--- lapack-netlib/SRC/cgemlq.f | 35 +++++++---- lapack-netlib/SRC/cgemqr.f | 31 ++++++--- lapack-netlib/SRC/cgeqlf.f | 12 ++-- lapack-netlib/SRC/cgeqp3rk.f | 24 +++---- lapack-netlib/SRC/cgeqr.f | 23 ++++--- lapack-netlib/SRC/cgeqrfp.f | 26 +++++--- lapack-netlib/SRC/cgesvdx.f | 14 ++--- lapack-netlib/SRC/cgesvj.f | 70 ++++++++++++--------- lapack-netlib/SRC/cgetri.f | 6 +- lapack-netlib/SRC/cgetsls.f | 7 ++- lapack-netlib/SRC/cgetsqrhrt.f | 27 +++++--- lapack-netlib/SRC/cgges3.f | 40 +++++++----- lapack-netlib/SRC/cggev3.f | 28 +++++---- lapack-netlib/SRC/cgghd3.f | 22 ++++--- lapack-netlib/SRC/cggqrf.f | 6 +- lapack-netlib/SRC/cggrqf.f | 6 +- lapack-netlib/SRC/cggsvd3.f | 4 +- lapack-netlib/SRC/cggsvp3.f | 4 +- lapack-netlib/SRC/cheevd.f | 11 ++-- lapack-netlib/SRC/cheevr.f | 31 +++++---- lapack-netlib/SRC/cheevr_2stage.f | 57 ++++++++++------- lapack-netlib/SRC/cheevx.f | 6 +- lapack-netlib/SRC/chesv_aa.f | 15 ++--- lapack-netlib/SRC/chesv_aa_2stage.f | 24 +++---- lapack-netlib/SRC/chesvx.f | 11 ++-- lapack-netlib/SRC/chetrd_2stage.f | 94 +++++++++++++++------------- lapack-netlib/SRC/chetrd_hb2st.F | 40 +++++++----- lapack-netlib/SRC/chetrd_he2hb.f | 20 +++--- lapack-netlib/SRC/chetrf.f | 8 +-- lapack-netlib/SRC/chetrf_aa.f | 29 ++++++--- lapack-netlib/SRC/chetrf_aa_2stage.f | 25 ++++---- lapack-netlib/SRC/chetrf_rk.f | 10 +-- lapack-netlib/SRC/chetrf_rook.f | 6 +- lapack-netlib/SRC/chetri2.f | 30 +++++---- lapack-netlib/SRC/chetri_3.f | 21 ++++--- lapack-netlib/SRC/chetrs_aa.f | 27 +++++--- lapack-netlib/SRC/clamswlq.f | 66 +++++++++++-------- lapack-netlib/SRC/clamtsqr.f | 74 ++++++++++++---------- lapack-netlib/SRC/claswlq.f | 80 +++++++++++++---------- lapack-netlib/SRC/clatrs3.f | 32 +++++++--- lapack-netlib/SRC/clatsqr.f | 89 +++++++++++++++----------- lapack-netlib/SRC/dsytrf.f | 5 +- lapack-netlib/SRC/ssytrd_sb2st.F | 40 +++++++----- 47 files changed, 783 insertions(+), 533 deletions(-) diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f index 5687161a5..5920b1cf5 100644 --- a/lapack-netlib/SRC/cgebrd.f +++ b/lapack-netlib/SRC/cgebrd.f @@ -123,7 +123,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -148,7 +149,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -225,8 +226,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL CGEBD2, CGEMM, CLABRD, XERBLA @@ -236,16 +237,24 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, SROUNDUP_LWORK * .. * .. Executable Statements .. * * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = REAL( LWKOPT ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -253,7 +262,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -265,7 +274,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -284,7 +292,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -343,7 +351,7 @@ * CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = WS + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of CGEBRD diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f index f407f931a..7ba87cc01 100644 --- a/lapack-netlib/SRC/cgehrd.f +++ b/lapack-netlib/SRC/cgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index ff482bc42..24aaa982e 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f index 75f5bc960..3847a958a 100644 --- a/lapack-netlib/SRC/cgelqf.f +++ b/lapack-netlib/SRC/cgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGELQF diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f index e0cf78bc0..e5b02b669 100644 --- a/lapack-netlib/SRC/cgemlq.f +++ b/lapack-netlib/SRC/cgemlq.f @@ -110,16 +110,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -143,7 +144,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -159,11 +160,13 @@ *> block sizes MB and NB returned by ILAENV, CGELQ will use either *> CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute *> the LQ factorization. -*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to +*> This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in CLAMSWLQ or CGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -185,11 +188,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLAMSWLQ, CGEMLQT, XERBLA @@ -201,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -216,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -244,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -261,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -274,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = REAL( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f index ea9de146e..0b7dd9dd7 100644 --- a/lapack-netlib/SRC/cgemqr.f +++ b/lapack-netlib/SRC/cgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGEMQRT, CLAMTSQR, XERBLA @@ -203,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -251,7 +262,7 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f index 918bbddad..6c67344c5 100644 --- a/lapack-netlib/SRC/cgeqlf.f +++ b/lapack-netlib/SRC/cgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -187,10 +188,11 @@ NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 ) LWKOPT = N*NB END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * @@ -277,7 +279,7 @@ IF( MU.GT.0 .AND. NU.GT.0 ) $ CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQLF diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f index 587860684..731c44edb 100644 --- a/lapack-netlib/SRC/cgeqp3rk.f +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for CGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -627,8 +628,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SCNRM2 - EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV + REAL SLAMCH, SCNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC CMPLX, MAX, MIN @@ -703,7 +705,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -726,7 +728,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -778,7 +780,7 @@ * * Array TAU is not set and contains undefined elements. * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -797,7 +799,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -828,7 +830,7 @@ DO J = 1, MINMN TAU( J ) = CZERO END DO - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -873,7 +875,7 @@ TAU( J ) = CZERO END DO * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -991,7 +993,7 @@ * * Return from the routine. * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1082,7 +1084,7 @@ * END IF * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f index d10e3da65..3617594d0 100644 --- a/lapack-netlib/SRC/cgeqr.f +++ b/lapack-netlib/SRC/cgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,12 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME - EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATSQR, CGEQRT, XERBLA @@ -244,8 +247,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +258,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +273,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +287,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +314,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f index eaf98ddf3..5b6226c67 100644 --- a/lapack-netlib/SRC/cgeqrfp.f +++ b/lapack-netlib/SRC/cgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL CGEQR2P, CLARFB, CLARFT, XERBLA @@ -182,8 +183,16 @@ * INFO = 0 NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +200,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -203,7 +212,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -211,7 +219,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +281,7 @@ $ CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGEQRFP diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f index fbdb121ca..e1856a65f 100644 --- a/lapack-netlib/SRC/cgesvdx.f +++ b/lapack-netlib/SRC/cgesvdx.f @@ -208,7 +208,7 @@ *> \param[out] WORK *> \verbatim *> WORK is COMPLEX array, dimension (MAX(1,LWORK)) -*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK; +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK @@ -261,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup gesvdx * * ===================================================================== SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU, @@ -312,8 +312,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE - EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE + REAL SLAMCH, CLANGE, SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, SQRT @@ -448,7 +448,7 @@ END IF END IF MAXWRK = MAX( MAXWRK, MINWRK ) - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN INFO = -19 @@ -464,7 +464,7 @@ * * Quick return if possible * - IF( M.EQ.0 .OR. N.EQ.0 ) THEN + IF( MINMN.EQ.0 ) THEN RETURN END IF * @@ -846,7 +846,7 @@ * * Return optimal workspace in WORK(1) * - WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO ) + WORK( 1 ) = SROUNDUP_LWORK( MAXWRK ) * RETURN * diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f index 149cf5e48..b9c8f1709 100644 --- a/lapack-netlib/SRC/cgesvj.f +++ b/lapack-netlib/SRC/cgesvj.f @@ -208,15 +208,17 @@ *> \verbatim *> CWORK is COMPLEX array, dimension (max(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -374,16 +378,17 @@ PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX AAPQ, OMPQ - REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX AAPQ, OMPQ + REAL AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND, + $ MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, + $ RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -398,8 +403,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -422,7 +427,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M + N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +456,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +468,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f index 2060d1444..2eb3da7ab 100644 --- a/lapack-netlib/SRC/cgetri.f +++ b/lapack-netlib/SRC/cgetri.f @@ -153,8 +153,8 @@ * INFO = 0 NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -252,7 +252,7 @@ $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of CGETRI diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index b4bb7562f..3f43dc8de 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF ( M.GE.N ) THEN CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f index 4e4dc1d4a..087e9bc7f 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.f +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpexOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -200,6 +202,10 @@ INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS * .. +* .. External Functions .. + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, $ XERBLA @@ -212,7 +218,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +231,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +269,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -277,14 +284,14 @@ CALL XERBLA( 'CGETSQRHRT', -INFO ) RETURN ELSE IF ( LQUERY ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * * Quick return if possible * IF( MIN( M, N ).EQ.0 ) THEN - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN END IF * @@ -341,9 +348,9 @@ END IF END DO * - WORK( 1 ) = CMPLX( LWORKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT ) RETURN * * End of CGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f index aac9f9510..c1ca79688 100644 --- a/lapack-netlib/SRC/cgges3.f +++ b/lapack-netlib/SRC/cgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -310,13 +312,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC MAX, SQRT @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,29 +380,33 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) END IF CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) END IF * @@ -422,7 +429,6 @@ EPS = SLAMCH( 'P' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -585,7 +591,7 @@ * 30 CONTINUE * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f index 9483ecdeb..d2b75aebc 100644 --- a/lapack-netlib/SRC/cggev3.f +++ b/lapack-netlib/SRC/cggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKOPT, LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX X @@ -253,13 +254,12 @@ * .. * .. External Subroutines .. EXTERNAL CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY, - $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD, - $ XERBLA + $ CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA * .. * .. External Functions .. LOGICAL LSAME - REAL CLANGE, SLAMCH - EXTERNAL LSAME, CLANGE, SLAMCH + REAL CLANGE, SLAMCH, SROUNDUP_LWORK + EXTERNAL LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, REAL, SQRT @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( N, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = CMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = SLAMCH( 'E' )*SLAMCH( 'B' ) SMLNUM = SLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * @@ -549,7 +553,7 @@ IF( ILBSCL ) $ CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR ) * - WORK( 1 ) = CMPLX( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CGGEV3 diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 1074b4828..f7175a72c 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -180,14 +180,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -212,7 +212,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -265,7 +265,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL ILAENV, LSAME + REAL SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM, @@ -280,8 +281,13 @@ * INFO = 0 NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) - WORK( 1 ) = CMPLX( LWKOPT ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) INITZ = LSAME( COMPZ, 'I' ) @@ -330,7 +336,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -888,7 +893,8 @@ IF ( JCOL.LT.IHI ) $ CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) - WORK( 1 ) = CMPLX( LWKOPT ) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index 29b0bf4af..309f170e8 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -251,8 +251,8 @@ NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index 273ab3ef7..8470a1ce2 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -250,8 +250,8 @@ NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -288,7 +288,7 @@ * QR factorization of P-by-N matrix B: B = Z*T * CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO ) - WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) ) * RETURN * diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f index f248aebd5..4c4b85bae 100644 --- a/lapack-netlib/SRC/cggsvd3.f +++ b/lapack-netlib/SRC/cggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -333,7 +333,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f index 008a053a2..e19f7efd5 100644 --- a/lapack-netlib/SRC/cggsvp3.f +++ b/lapack-netlib/SRC/cggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index b5ca804eb..9b62a2df6 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, -*> dimension (LRWORK) +*> RWORK is REAL array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -282,8 +281,8 @@ LROPT = LRWMIN LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -378,8 +377,8 @@ CALL SSCAL( IMAX, ONE / SIGMA, W, 1 ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) - RWORK( 1 ) = LROPT + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LROPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f index 05c5e66be..ad5c8cd4a 100644 --- a/lapack-netlib/SRC/cheevr.f +++ b/lapack-netlib/SRC/cheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for CHETRD and for *> CUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,8 +463,8 @@ NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -710,8 +719,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f index 0332a09bc..e06925fcd 100644 --- a/lapack-netlib/SRC/cheevr_2stage.f +++ b/lapack-netlib/SRC/cheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by CUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -443,8 +446,9 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV, ILAENV2STAGE - REAL SLAMCH, CLANSY - EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE + REAL SLAMCH, CLANSY, SROUNDUP_LWORK + EXTERNAL LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL, @@ -472,9 +476,16 @@ IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -506,8 +517,8 @@ END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -535,7 +546,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = REAL( A( 1, 1 ) ) @@ -643,9 +654,9 @@ * * Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired @@ -666,7 +677,7 @@ CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 ) CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 ) * - IF (ABSTOL .LE. TWO*N*EPS) THEN + IF ( ABSTOL .LE. TWO*N*EPS ) THEN TRYRAC = .TRUE. ELSE TRYRAC = .FALSE. @@ -765,8 +776,8 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN - RWORK( 1 ) = LRWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f index e91599a44..a8a2bde63 100644 --- a/lapack-netlib/SRC/cheevx.f +++ b/lapack-netlib/SRC/cheevx.f @@ -348,14 +348,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 2*N NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) ) - LWKOPT = MAX( 1, ( NB + 1 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = ( NB + 1 )*N END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f index 53ecc0a16..0f41c9332 100644 --- a/lapack-netlib/SRC/chesv_aa.f +++ b/lapack-netlib/SRC/chesv_aa.f @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -197,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -207,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -240,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f index 12950c4af..05ebd9253 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.f +++ b/lapack-netlib/SRC/chesv_aa_2stage.f @@ -99,14 +99,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -146,14 +146,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -203,7 +204,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -225,6 +226,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -233,18 +235,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -254,7 +257,6 @@ RETURN END IF * -* * Compute the factorization A = U**H*T*U or A = L*T*L**H. * CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -268,7 +270,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f index c23a35ce7..bdaad55ec 100644 --- a/lapack-netlib/SRC/chesvx.f +++ b/lapack-netlib/SRC/chesvx.f @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,17 +347,17 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -405,7 +406,7 @@ IF( RCOND.LT.SLAMCH( 'Epsilon' ) ) $ INFO = N + 1 * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f index f5ad35f27..ec7075798 100644 --- a/lapack-netlib/SRC/chetrd_2stage.f +++ b/lapack-netlib/SRC/chetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download CHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download CHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX array, dimension (LHOUS2) +*> HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2=-1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,13 +145,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -157,9 +162,9 @@ *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -250,7 +255,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV2STAGE - EXTERNAL LSAME, ILAENV2STAGE + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV2STAGE, SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -265,10 +271,13 @@ * KD = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -285,8 +294,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -309,14 +318,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN END IF - CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRD_HB2ST', -INFO ) @@ -324,8 +333,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_2STAGE diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F index 3688e40a3..b0d3e45fb 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.F +++ b/lapack-netlib/SRC/chetrd_hb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension LWORK. +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -262,7 +267,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SICEV, SIZETAU, LDV, LHMIN, LWMIN @@ -286,7 +291,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -295,9 +299,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -318,8 +327,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -575,8 +584,7 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HB2ST diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f index 090f02100..42e71e0b2 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.f +++ b/lapack-netlib/SRC/chetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'CHETRD_HE2HB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CHETRD_HE2HB diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f index 0c596ffe7..2836e30bc 100644 --- a/lapack-netlib/SRC/chetrf.f +++ b/lapack-netlib/SRC/chetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -228,8 +228,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -347,7 +347,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f index 0547a4eab..51410a6ed 100644 --- a/lapack-netlib/SRC/chetrf_aa.f +++ b/lapack-netlib/SRC/chetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= 2*N. For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX ALPHA * .. @@ -179,19 +181,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = REAL( A( 1, 1 ) ) RETURN END IF @@ -460,7 +469,7 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_AA diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f index 400efdf26..a79343753 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.f +++ b/lapack-netlib/SRC/chetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX array, dimension (LTB) +*> TB is COMPLEX array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX workspace of size LWORK +*> WORK is COMPLEX workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -188,7 +188,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. @@ -213,9 +214,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -229,10 +230,10 @@ NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -241,7 +242,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f index ef442c937..a13c740e3 100644 --- a/lapack-netlib/SRC/chetrf_rk.f +++ b/lapack-netlib/SRC/chetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -311,8 +311,8 @@ * Determine the block size * NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_RK diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f index 1593c2edc..df0323520 100644 --- a/lapack-netlib/SRC/chetrf_rook.f +++ b/lapack-netlib/SRC/chetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -264,7 +264,7 @@ * NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -387,7 +387,7 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of CHETRF_ROOK diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f index 2865a6440..f15065ae7 100644 --- a/lapack-netlib/SRC/chetri2.f +++ b/lapack-netlib/SRC/chetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexHEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CHETRI2X, CHETRI, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of CHETRI2 diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f index deda63598..ccfce5070 100644 --- a/lapack-netlib/SRC/chetri_3.f +++ b/lapack-netlib/SRC/chetri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX array, dimension (N+NB+1)*(NB+3). +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'CHETRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f index 879549106..07179ab92 100644 --- a/lapack-netlib/SRC/chetrs_aa.f +++ b/lapack-netlib/SRC/chetrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -151,24 +157,30 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME,SROUNDUP_LWORK + EXTERNAL LSAME, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f index 5daf60bf6..8f474a3ab 100644 --- a/lapack-netlib/SRC/clamswlq.f +++ b/lapack-netlib/SRC/clamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -193,91 +196,100 @@ *> * ===================================================================== SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CTPMLQT, CGEMLQT, XERBLA + EXTERNAL CTPMLQT, CGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMSWLQ', -INFO ) - WORK(1) = SROUNDUP_LWORK(LW) RETURN - ELSE IF (LQUERY) THEN - WORK(1) = SROUNDUP_LWORK(LW) + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -404,7 +416,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMSWLQ diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f index 05021e642..13625087f 100644 --- a/lapack-netlib/SRC/clamtsqr.f +++ b/lapack-netlib/SRC/clamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -195,45 +197,47 @@ *> * ===================================================================== SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. External Subroutines .. - EXTERNAL CGEMQRT, CTPMQRT, XERBLA + EXTERNAL CGEMQRT, CTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -241,11 +245,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -256,38 +266,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(LW) + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -412,7 +422,7 @@ * END IF * - WORK(1) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLAMTSQR diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f index 12e8373df..2044e055c 100644 --- a/lapack-netlib/SRC/claswlq.f +++ b/lapack-netlib/SRC/claswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,33 +165,35 @@ *> * ===================================================================== SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME INTEGER ILAENV REAL SROUNDUP_LWORK EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL CGELQT, CTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -200,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -213,60 +224,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(MB*M) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M , (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), $ LDA, T(1,CTR*M+1), $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), $ LDA, T(1,CTR*M+1), LDT, $ WORK, INFO ) - END IF + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLASWLQ diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f index 0502f6898..354141a8b 100644 --- a/lapack-netlib/SRC/clatrs3.f +++ b/lapack-netlib/SRC/clatrs3.f @@ -152,13 +152,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,15 +262,16 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - REAL SLAMCH, CLANGE, SLARMM - EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM + REAL SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM, + $ SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL CLATRS, CSSCAL, XERBLA @@ -296,15 +302,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -326,7 +341,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -659,6 +674,9 @@ END IF END DO END DO +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) +* RETURN * * End of CLATRS3 diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f index cd2cb4aa7..67403693f 100644 --- a/lapack-netlib/SRC/clatsqr.f +++ b/lapack-netlib/SRC/clatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -165,32 +168,34 @@ *> * ===================================================================== SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME REAL SROUNDUP_LWORK EXTERNAL LSAME, SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL CGEQRT, CTPQRT, XERBLA + EXTERNAL CGEQRT, CTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -201,6 +206,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -208,64 +220,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = SROUNDUP_LWORK(NB*N) +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, $ T(1,CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + IF( II.LE.M ) THEN + CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of CLATSQR diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index aee9b3f6a..2a1a2d4dc 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -135,7 +135,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -352,6 +352,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F index 32bae26dc..111eaa93e 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.F +++ b/lapack-netlib/SRC/ssytrd_sb2st.F @@ -132,15 +132,17 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is REAL array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is REAL array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error @@ -152,14 +154,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension LWORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension) +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -261,7 +266,7 @@ INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, + $ NBTILES, TTYPE, TID, NTHREADS, $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SISEV, SIZETAU, LDV, LHMIN, LWMIN @@ -283,7 +288,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -292,9 +296,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -315,8 +324,8 @@ END IF * IF( INFO.EQ.0 ) THEN - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + HOUS( 1 ) = SROUNDUP_LWORK( LHMIN ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -544,8 +553,7 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SB2ST From 29d6024ec534858294e648d189f9026302a903b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 19:44:11 +0100 Subject: [PATCH 502/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/dgebrd.f | 26 ++++-- lapack-netlib/SRC/dgehrd.f | 24 +++-- lapack-netlib/SRC/dgelq.f | 4 +- lapack-netlib/SRC/dgelqf.f | 20 ++-- lapack-netlib/SRC/dgelsd.f | 7 +- lapack-netlib/SRC/dgemlq.f | 32 ++++--- lapack-netlib/SRC/dgemqr.f | 32 ++++--- lapack-netlib/SRC/dgeqlf.f | 10 +- lapack-netlib/SRC/dgeqp3rk.f | 3 +- lapack-netlib/SRC/dgeqr.f | 20 ++-- lapack-netlib/SRC/dgeqrfp.f | 24 +++-- lapack-netlib/SRC/dgerqf.f | 4 +- lapack-netlib/SRC/dgesvj.f | 32 +++++-- lapack-netlib/SRC/dgetri.f | 5 +- lapack-netlib/SRC/dgetsls.f | 12 ++- lapack-netlib/SRC/dgetsqrhrt.f | 18 ++-- lapack-netlib/SRC/dgges.f | 12 +-- lapack-netlib/SRC/dgges3.f | 39 +++++--- lapack-netlib/SRC/dggev3.f | 37 ++++---- lapack-netlib/SRC/dgghd3.f | 17 ++-- lapack-netlib/SRC/dggqrf.f | 5 +- lapack-netlib/SRC/dggrqf.f | 4 +- lapack-netlib/SRC/dggsvd3.f | 4 +- lapack-netlib/SRC/dggsvp3.f | 4 +- lapack-netlib/SRC/dlamswlq.f | 62 ++++++++----- lapack-netlib/SRC/dlamtsqr.f | 74 ++++++++------- lapack-netlib/SRC/dlaswlq.f | 92 ++++++++++-------- lapack-netlib/SRC/dlatrs3.f | 27 +++++- lapack-netlib/SRC/dlatsqr.f | 92 ++++++++++-------- lapack-netlib/SRC/dsyev_2stage.f | 22 ++--- lapack-netlib/SRC/dsyevd.f | 5 +- lapack-netlib/SRC/dsyevr.f | 21 +++-- lapack-netlib/SRC/dsyevr_2stage.f | 40 ++++---- lapack-netlib/SRC/dsyevx.f | 6 +- lapack-netlib/SRC/dsysv_aa.f | 13 +-- lapack-netlib/SRC/dsysv_aa_2stage.f | 24 ++--- lapack-netlib/SRC/dsysvx.f | 9 +- lapack-netlib/SRC/dsytrd.f | 4 +- lapack-netlib/SRC/dsytrd_2stage.f | 89 ++++++++++-------- lapack-netlib/SRC/dsytrd_sb2st.F | 134 ++++++++++++++------------- lapack-netlib/SRC/dsytrd_sy2sb.f | 18 ++-- lapack-netlib/SRC/dsytrf_aa.f | 27 ++++-- lapack-netlib/SRC/dsytrf_aa_2stage.f | 22 ++--- lapack-netlib/SRC/dsytrf_rk.f | 6 +- lapack-netlib/SRC/dsytrf_rook.f | 4 +- lapack-netlib/SRC/dsytri2.f | 27 +++--- lapack-netlib/SRC/dsytri_3.f | 21 +++-- lapack-netlib/SRC/dsytrs_aa.f | 27 ++++-- 48 files changed, 751 insertions(+), 510 deletions(-) diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f index 0f0d1651a..ac11d48a0 100644 --- a/lapack-netlib/SRC/dgebrd.f +++ b/lapack-netlib/SRC/dgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL DGEBD2, DGEMM, DLABRD, XERBLA @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f index a40c61cb6..d95bbd182 100644 --- a/lapack-netlib/SRC/dgehrd.f +++ b/lapack-netlib/SRC/dgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) + DOUBLE PRECISION A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, $ ONE = 1.0D+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - DOUBLE PRECISION EI + DOUBLE PRECISION EI * .. * .. External Subroutines .. EXTERNAL DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF WORK( 1 ) = LWKOPT END IF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -344,6 +349,7 @@ * Use unblocked code to reduce the rest of the matrix * CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) +* WORK( 1 ) = LWKOPT * RETURN diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index 013b6c356..255e8732f 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f index ed3372f96..f0eb00a55 100644 --- a/lapack-netlib/SRC/dgelqf.f +++ b/lapack-netlib/SRC/dgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f index b1f45a2c6..7dc564f48 100644 --- a/lapack-netlib/SRC/dgelsd.f +++ b/lapack-netlib/SRC/dgelsd.f @@ -188,7 +188,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup gelsd * *> \par Contributors: * ================== @@ -228,7 +228,7 @@ DOUBLE PRECISION ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM * .. * .. External Subroutines .. - EXTERNAL DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD, + EXTERNAL DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD, $ DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA * .. * .. External Functions .. @@ -276,7 +276,7 @@ $ LOG( TWO ) ) + 1, 0 ) * IF( INFO.EQ.0 ) THEN - MAXWRK = 0 + MAXWRK = 1 LIWORK = 3*MINMN*NLVL + 11*MINMN MM = M IF( M.GE.N .AND. M.GE.MNTHR ) THEN @@ -372,7 +372,6 @@ SFMIN = DLAMCH( 'S' ) SMLNUM = SFMIN / EPS BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A if max entry outside range [SMLNUM,BIGNUM]. * diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f index 3ba209105..757683f46 100644 --- a/lapack-netlib/SRC/dgemlq.f +++ b/lapack-netlib/SRC/dgemlq.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,11 +161,13 @@ *> block sizes MB and NB returned by ILAENV, DGELQ will use either *> DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute *> the LQ factorization. -*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to +*> This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in DLAMSWLQ or DGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -186,7 +189,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -202,7 +205,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -217,6 +220,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -245,12 +255,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -262,7 +272,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -275,7 +285,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f index 022cf21e4..608815483 100644 --- a/lapack-netlib/SRC/dgemqr.f +++ b/lapack-netlib/SRC/dgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -160,12 +161,14 @@ *> block sizes MB and NB returned by ILAENV, DGEQR will use either *> DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute *> the QR factorization. -*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to +*> This version of DGEMQR will use either DLAMTSQR or DGEMQRT to *> multiply matrix Q by another matrix. *> Further Details in DLATMSQR or DGEMQRT. *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f index b8ac0b1a0..a72d9dc76 100644 --- a/lapack-netlib/SRC/dgeqlf.f +++ b/lapack-netlib/SRC/dgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f index 117a68287..b8e41b39c 100644 --- a/lapack-netlib/SRC/dgeqp3rk.f +++ b/lapack-netlib/SRC/dgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for DGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f index eac8930ce..6ed8f211f 100644 --- a/lapack-netlib/SRC/dgeqr.f +++ b/lapack-netlib/SRC/dgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f index 46d2ee479..aa757e96c 100644 --- a/lapack-netlib/SRC/dgeqrfp.f +++ b/lapack-netlib/SRC/dgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL DGEQR2P, DLARFB, DLARFT, XERBLA @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f index cca9d6367..435239cc7 100644 --- a/lapack-netlib/SRC/dgerqf.f +++ b/lapack-netlib/SRC/dgerqf.f @@ -114,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gerqf * *> \par Further Details: * ===================== @@ -189,7 +189,7 @@ END IF WORK( 1 ) = LWKOPT * - IF ( .NOT.LQUERY ) THEN + IF( .NOT.LQUERY ) THEN IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) $ INFO = -7 END IF diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f index 5fdb21e45..198bfb0a5 100644 --- a/lapack-netlib/SRC/dgesvj.f +++ b/lapack-netlib/SRC/dgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On entry : *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -365,9 +370,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. DOUBLE PRECISION FASTR( 5 ) @@ -408,6 +413,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -427,7 +440,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -437,11 +450,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = LWMIN + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f index 92ef90c18..7b5a3a1b6 100644 --- a/lapack-netlib/SRC/dgetri.f +++ b/lapack-netlib/SRC/dgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -151,8 +151,9 @@ * INFO = 0 NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index 25f4c12c2..73b505ff7 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -189,7 +189,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, DLANGE + EXTERNAL LSAME, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DGEQR, DGEMQR, DLASCL, DLASET, @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEM = 1 + WSIZEO = 1 + ELSE IF( M.GE.N ) THEN CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -294,7 +297,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f index 668deeba8..682c7c30f 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.f +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +215,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +228,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +266,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -346,4 +350,4 @@ * * End of DGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f index 31db23715..b9ffc7982 100644 --- a/lapack-netlib/SRC/dgges.f +++ b/lapack-netlib/SRC/dgges.f @@ -234,8 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If N = 0, LWORK >= 1, else LWORK >= 8*N+16. -*> For good performance , LWORK must generally be larger. +*> If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges * * ===================================================================== SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB, @@ -321,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -431,7 +430,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f index 7b00d294a..2ef55951a 100644 --- a/lapack-netlib/SRC/dgges3.f +++ b/lapack-netlib/SRC/dgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -273,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -318,9 +321,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -362,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -378,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -386,29 +394,33 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR, $ SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1, $ IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -430,7 +442,6 @@ EPS = DLAMCH( 'P' ) SAFMIN = DLAMCH( 'S' ) SAFMAX = ONE / SAFMIN - CALL DLABAD( SAFMIN, SAFMAX ) SMLNUM = SQRT( SAFMIN ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f index 4bbe8a40f..b970c04c4 100644 --- a/lapack-netlib/SRC/dggev3.f +++ b/lapack-netlib/SRC/dggev3.f @@ -188,7 +188,9 @@ *> *> \param[in] LWORK *> \verbatim -*> LWORK is INTEGER +*> LWORK is INTEGER. +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -217,7 +219,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -256,9 +259,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD, - $ DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, - $ XERBLA + EXTERNAL DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY, + $ DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA * .. * .. External Functions .. LOGICAL LSAME @@ -299,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -313,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -321,13 +324,13 @@ * IF( INFO.EQ.0 ) THEN CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1, $ IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) END IF IF( ILV ) THEN CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, @@ -336,18 +339,21 @@ CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = LWKOPT END IF - - WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -367,7 +373,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f index 43d7a77df..21a668573 100644 --- a/lapack-netlib/SRC/dgghd3.f +++ b/lapack-netlib/SRC/dgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -211,7 +211,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -885,6 +889,7 @@ IF ( JCOL.LT.IHI ) $ CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DBLE( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 39d27a5c9..edac7f22f 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN @@ -287,6 +287,7 @@ * RQ factorization of N-by-P matrix B: B = T*Z. * CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) +* WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) ) * RETURN diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index ddf4104c5..3b1024c1c 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f index 503f0d8cc..ee4d11e86 100644 --- a/lapack-netlib/SRC/dggsvd3.f +++ b/lapack-netlib/SRC/dggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -328,7 +328,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleGEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f index 4e1db3117..485d95b36 100644 --- a/lapack-netlib/SRC/dggsvp3.f +++ b/lapack-netlib/SRC/dggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -250,7 +250,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f index 70e78f4b1..07ef1bd57 100644 --- a/lapack-netlib/SRC/dlamswlq.f +++ b/lapack-netlib/SRC/dlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,29 +192,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, CTR, LW + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, CTR, LW, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -223,52 +228,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +415,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMSWLQ diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f index 962a31476..023db5ac9 100644 --- a/lapack-netlib/SRC/dlamtsqr.f +++ b/lapack-netlib/SRC/dlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,29 +193,31 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -225,12 +229,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +243,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +264,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +421,8 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN +* RETURN * * End of DLAMTSQR diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f index c95c94cbc..636c12dc8 100644 --- a/lapack-netlib/SRC/dlaswlq.f +++ b/lapack-netlib/SRC/dlaswlq.f @@ -99,19 +99,22 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) -*> +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +162,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL DGELQT, DTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +203,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LT.0 ) THEN INFO = -4 @@ -209,60 +223,62 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF +* + WORK( 1 ) = LWMIN * - WORK( 1 ) = M * MB RETURN * * End of DLASWLQ diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f index e6d78b672..d18675b2d 100644 --- a/lapack-netlib/SRC/dlatrs3.f +++ b/lapack-netlib/SRC/dlatrs3.f @@ -151,13 +151,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal size of *> WORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +169,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +186,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +258,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -292,15 +297,24 @@ * row. WORK( I+KK*LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -649,6 +663,9 @@ END IF END DO END DO +* + WORK( 1 ) = LWMIN +* RETURN * * End of DLATRS3 diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f index 94a04be02..0000aab68 100644 --- a/lapack-netlib/SRC/dlatsqr.f +++ b/lapack-netlib/SRC/dlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,27 +164,29 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ), WORK( * ), T(LDT, *) + DOUBLE PRECISION A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME @@ -198,6 +203,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,65 +217,67 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF +* IF( INFO.NE.0 ) THEN CALL XERBLA( 'DLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF * - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of DLATSQR diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f index 50d51d992..286366bfe 100644 --- a/lapack-netlib/SRC/dsyev_2stage.f +++ b/lapack-netlib/SRC/dsyev_2stage.f @@ -20,7 +20,7 @@ * Definition: * =========== * -* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, +* SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, * INFO ) * * IMPLICIT NONE @@ -97,7 +97,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -105,12 +105,12 @@ *> \verbatim *> LWORK is INTEGER *> The length of the array WORK. LWORK >= 1, when N <= 1; -*> otherwise +*> otherwise *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 2*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 2*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -143,7 +143,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heev_2stage * *> \par Further Details: * ===================== @@ -161,7 +161,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -169,16 +169,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * * ===================================================================== - SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, + SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, $ INFO ) * IMPLICIT NONE @@ -305,7 +305,7 @@ LLWORK = LWORK - INDWRK + 1 * CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ), - $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, + $ WORK( INDTAU ), WORK( INDHOUS ), LHTRD, $ WORK( INDWRK ), LLWORK, IINFO ) * * For eigenvalues only, call DSTERF. For eigenvectors, first call diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index b27f4cdc7..adcfcb373 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, -*> dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -160,7 +159,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevd * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f index 698691533..8647b0162 100644 --- a/lapack-netlib/SRC/dsyevr.f +++ b/lapack-netlib/SRC/dsyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for DSYTRD and DORMTR *> returned by ILAENV. @@ -285,13 +286,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -315,7 +317,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -390,8 +392,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -450,7 +457,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f index 09242bbd3..63d5e3159 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.f +++ b/lapack-netlib/SRC/dsyevr_2stage.f @@ -263,7 +263,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the orthogonal transformations applied by DORMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -277,12 +277,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + 5*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -330,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -358,7 +360,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -366,11 +368,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -444,8 +446,14 @@ IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -484,7 +492,7 @@ * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN END IF * @@ -504,7 +512,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 7 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -608,7 +616,7 @@ * Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form. * * - CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), + CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), $ WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ), $ LHTRD, WORK( INDWK ), LLWORK, IINFO ) * @@ -727,7 +735,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f index 99719874b..fd6a78e32 100644 --- a/lapack-netlib/SRC/dsyevx.f +++ b/lapack-netlib/SRC/dsyevx.f @@ -244,7 +244,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYeigen +*> \ingroup heevx * * ===================================================================== SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU, @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = LWKMIN + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = LWKOPT END IF + WORK( 1 ) = LWKOPT * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f index 8dab5a384..0a96ecd7e 100644 --- a/lapack-netlib/SRC/dsysv_aa.f +++ b/lapack-netlib/SRC/dsysv_aa.f @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f index 72fbe1e9a..90dd0a38a 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.f +++ b/lapack-netlib/SRC/dsysv_aa_2stage.f @@ -101,14 +101,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -148,14 +148,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -179,7 +180,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -205,7 +206,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME @@ -226,6 +227,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +236,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +258,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f index a30831e72..b2b8210ca 100644 --- a/lapack-netlib/SRC/dsysvx.f +++ b/lapack-netlib/SRC/dsysvx.f @@ -275,7 +275,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f index 3dcfc3db2..58d4b633b 100644 --- a/lapack-netlib/SRC/dsytrd.f +++ b/lapack-netlib/SRC/dsytrd.f @@ -139,7 +139,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd * *> \par Further Details: * ===================== @@ -247,7 +247,7 @@ * Determine the block size. * NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f index 8ae77d3e4..a88ac1c73 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.f +++ b/lapack-netlib/SRC/dsytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download DSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download DSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * DOUBLE PRECISION A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is DOUBLE PRECISION array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2) +*> HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SY2SB', -INFO ) RETURN END IF - CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRD_SB2ST', -INFO ) @@ -324,8 +332,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_2STAGE diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F index bb74dd491..04d03d587 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.F +++ b/lapack-netlib/SRC/dsytrd_sb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the dsytrd_sy2sb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the dsytrd_sy2sb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the dsytrd_sy2sb *> routine has been called to produce AB (e.g., AB is *> the output of dsytrd_sy2sb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is DOUBLE PRECISION array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension LWORK. +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup real16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * #if defined(_OPENMP) @@ -258,11 +263,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIDEV, SIZETAU, LDV, LHMIN, LWMIN * .. @@ -274,7 +279,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -282,7 +287,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -291,9 +295,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -355,7 +364,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -363,11 +372,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* real because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* real because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -382,17 +391,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = ( AB( ABDPOS, I ) ) @@ -413,7 +422,7 @@ RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the symmetric band of A to a tridiagonal matrix. * THGRSIZ = N @@ -422,7 +431,7 @@ NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL DLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -431,7 +440,7 @@ * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -440,7 +449,7 @@ #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -451,7 +460,7 @@ ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -477,16 +486,16 @@ ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -494,20 +503,20 @@ !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -522,14 +531,14 @@ !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = ( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -543,11 +552,10 @@ 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of DSYTRD_SB2ST * END - + diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f index 1660b5c7e..38acc71f1 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.f +++ b/lapack-netlib/SRC/dsytrd_sy2sb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f index 9a0b26ce5..924d4c165 100644 --- a/lapack-netlib/SRC/dsytrf_aa.f +++ b/lapack-netlib/SRC/dsytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB DOUBLE PRECISION ALPHA * .. @@ -179,18 +181,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -203,11 +212,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f index c65bd86e6..fae95bab2 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.f +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is DOUBLE PRECISION array, dimension (LTB) +*> TB is DOUBLE PRECISION array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION workspace of size LWORK +*> WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used +*> to select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -211,9 +211,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -227,10 +227,10 @@ NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -239,7 +239,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 086586968..0717eb076 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f index 2f00d1802..316663485 100644 --- a/lapack-netlib/SRC/dsytrf_rook.f +++ b/lapack-netlib/SRC/dsytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -146,7 +146,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f index dbcdcdb58..5960d3992 100644 --- a/lapack-netlib/SRC/dsytri2.f +++ b/lapack-netlib/SRC/dsytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3) +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of DSYTRI2 diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f index 86d69cdfd..50834c605 100644 --- a/lapack-netlib/SRC/dsytri_3.f +++ b/lapack-netlib/SRC/dsytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3). +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -152,7 +153,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetri_3 * *> \par Contributors: * ================== @@ -208,8 +209,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = LWKOPT * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -217,7 +223,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -225,7 +231,6 @@ CALL XERBLA( 'DSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = LWKOPT RETURN END IF * diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f index 26b11a2a0..f0016cb7f 100644 --- a/lapack-netlib/SRC/dsytrs_aa.f +++ b/lapack-netlib/SRC/dsytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -123,7 +129,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleSYcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -151,7 +157,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -161,13 +167,19 @@ EXTERNAL DLACPY, DGTSV, DSWAP, DTRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -178,21 +190,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN From c082669ad46b0f87be6730249b937def5060f80c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 20:05:03 +0100 Subject: [PATCH 503/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/sgebrd.f | 28 +++++--- lapack-netlib/SRC/sgehrd.f | 26 +++++--- lapack-netlib/SRC/sgelq.f | 8 +-- lapack-netlib/SRC/sgelqf.f | 20 ++++-- lapack-netlib/SRC/sgemlq.f | 24 ++++--- lapack-netlib/SRC/sgemqr.f | 22 +++++-- lapack-netlib/SRC/sgeqlf.f | 8 ++- lapack-netlib/SRC/sgeqp3rk.f | 24 +++---- lapack-netlib/SRC/sgeqr.f | 22 ++++--- lapack-netlib/SRC/sgeqrfp.f | 28 +++++--- lapack-netlib/SRC/sgesvj.f | 36 ++++++++--- lapack-netlib/SRC/sgetri.f | 10 +-- lapack-netlib/SRC/sgetsls.f | 7 +- lapack-netlib/SRC/sgetsqrhrt.f | 16 +++-- lapack-netlib/SRC/sgges3.f | 24 +++++-- lapack-netlib/SRC/sggev3.f | 27 +++++--- lapack-netlib/SRC/sgghd3.f | 15 +++-- lapack-netlib/SRC/sggqrf.f | 9 ++- lapack-netlib/SRC/sggrqf.f | 2 +- lapack-netlib/SRC/sggsvd3.f | 2 +- lapack-netlib/SRC/sggsvp3.f | 5 +- lapack-netlib/SRC/slamswlq.f | 64 ++++++++++++------- lapack-netlib/SRC/slamtsqr.f | 76 +++++++++++++--------- lapack-netlib/SRC/slaswlq.f | 92 +++++++++++++++------------ lapack-netlib/SRC/slatrs3.f | 32 +++++++--- lapack-netlib/SRC/slatsqr.f | 95 ++++++++++++++++------------ lapack-netlib/SRC/ssyevd.f | 7 +- lapack-netlib/SRC/ssyevr.f | 19 ++++-- lapack-netlib/SRC/ssyevr_2stage.f | 22 +++++-- lapack-netlib/SRC/ssyevx.f | 6 +- lapack-netlib/SRC/ssysv_aa.f | 18 +++--- lapack-netlib/SRC/ssysv_aa_2stage.f | 27 ++++---- lapack-netlib/SRC/ssysvx.f | 7 +- lapack-netlib/SRC/ssytrd_2stage.f | 86 +++++++++++++------------ lapack-netlib/SRC/ssytrd_sy2sb.f | 18 ++++-- lapack-netlib/SRC/ssytrf.f | 5 +- lapack-netlib/SRC/ssytrf_aa.f | 36 +++++++---- lapack-netlib/SRC/ssytrf_aa_2stage.f | 18 +++--- lapack-netlib/SRC/ssytrf_rk.f | 8 +-- lapack-netlib/SRC/ssytrf_rook.f | 7 +- lapack-netlib/SRC/ssytri2.f | 32 ++++++---- lapack-netlib/SRC/ssytri_3.f | 21 +++--- lapack-netlib/SRC/ssytrs_aa.f | 30 ++++++--- 43 files changed, 671 insertions(+), 418 deletions(-) diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f index 2d0c6d651..b33ad0b1f 100644 --- a/lapack-netlib/SRC/sgebrd.f +++ b/lapack-netlib/SRC/sgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL SGEBD2, SGEMM, SLABRD, XERBLA @@ -242,9 +243,16 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -252,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -264,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -283,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using @@ -342,7 +349,8 @@ * CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ), $ TAUQ( I ), TAUP( I ), WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(WS) +* + WORK( 1 ) = SROUNDUP_LWORK( WS ) RETURN * * End of SGEBRD diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f index 47733d947..cfa17e156 100644 --- a/lapack-netlib/SRC/sgehrd.f +++ b/lapack-netlib/SRC/sgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - REAL A( LDA, * ), TAU( * ), WORK( * ) + REAL A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, $ ONE = 1.0E+0 ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - REAL EI + REAL EI * .. * .. External Subroutines .. EXTERNAL SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM, @@ -222,13 +222,19 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + ENDIF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -249,7 +255,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -269,7 +274,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of @@ -345,7 +350,8 @@ * Use unblocked code to reduce the rest of the matrix * CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 74c7cc267..75f02675d 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -295,9 +295,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -322,7 +322,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWREQ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f index 1ceec4742..3b3913d84 100644 --- a/lapack-netlib/SRC/sgelqf.f +++ b/lapack-netlib/SRC/sgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -175,9 +176,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -185,19 +185,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -267,7 +273,7 @@ $ CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGELQF diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f index 83536825c..7e4d9bf65 100644 --- a/lapack-netlib/SRC/sgemlq.f +++ b/lapack-netlib/SRC/sgemlq.f @@ -110,13 +110,14 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this *> value as WORK(1), and no error message related to WORK @@ -187,7 +188,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -207,7 +208,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -222,6 +223,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -250,12 +258,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -267,7 +275,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -280,7 +288,7 @@ $ MB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK( LW ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f index 3207f8bfd..19bf467b8 100644 --- a/lapack-netlib/SRC/sgemqr.f +++ b/lapack-netlib/SRC/sgemqr.f @@ -189,12 +189,13 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMQRT, SLAMTSQR, XERBLA @@ -206,7 +207,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) @@ -221,6 +222,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -249,12 +257,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN @@ -266,7 +274,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -279,7 +287,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = SROUNDUP_LWORK(LW) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f index b1266c89e..14942b765 100644 --- a/lapack-netlib/SRC/sgeqlf.f +++ b/lapack-netlib/SRC/sgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is the *> optimal blocksize. *> @@ -189,8 +190,9 @@ END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f index bb5da72dc..d3a335b88 100644 --- a/lapack-netlib/SRC/sgeqp3rk.f +++ b/lapack-netlib/SRC/sgeqp3rk.f @@ -427,7 +427,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= (3*N + NRHS - 1) +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= (3*N+NRHS-1), otherwise. *> For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )), *> where NB is the optimal block size for SGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. @@ -618,8 +619,9 @@ * .. External Functions .. LOGICAL SISNAN INTEGER ISAMAX, ILAENV - REAL SLAMCH, SNRM2 - EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV + REAL SLAMCH, SNRM2, SROUNDUP_LWORK + EXTERNAL SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV, + $ SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC REAL, MAX, MIN @@ -696,7 +698,7 @@ * LWKOPT = 2*N + NB*( N+NRHS+1 ) END IF - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN INFO = -15 @@ -719,7 +721,7 @@ K = 0 MAXC2NRMK = ZERO RELMAXC2NRMK = ZERO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -772,7 +774,7 @@ * * Array TAU is not set and contains undefined elements. * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -791,7 +793,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * END IF @@ -822,7 +824,7 @@ DO J = 1, MINMN TAU( J ) = ZERO END DO - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -867,7 +869,7 @@ TAU( J ) = ZERO END DO * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN END IF * @@ -985,7 +987,7 @@ * * Return from the routine. * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * @@ -1072,7 +1074,7 @@ * END IF * - WORK( 1 ) = REAL( LWKOPT ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f index 6f41a92ea..79a515e1c 100644 --- a/lapack-netlib/SRC/sgeqr.f +++ b/lapack-netlib/SRC/sgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,11 +190,13 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SLATSQR, SGEQRT, XERBLA @@ -244,8 +248,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +259,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +274,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +288,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +315,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = SROUNDUP_LWORK( LWREQ ) * RETURN * diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f index d1ee2a828..37747c512 100644 --- a/lapack-netlib/SRC/sgeqrfp.f +++ b/lapack-netlib/SRC/sgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL SGEQR2P, SLARFB, SLARFT, XERBLA @@ -173,8 +174,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Executable Statements .. * @@ -182,8 +184,16 @@ * INFO = 0 NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -191,7 +201,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -211,7 +221,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. @@ -273,7 +283,7 @@ $ CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK, $ IINFO ) * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGEQRFP diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f index 3f53a5a15..36aed2853 100644 --- a/lapack-netlib/SRC/sgesvj.f +++ b/lapack-netlib/SRC/sgesvj.f @@ -208,7 +208,7 @@ *> *> \param[in,out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On entry, *> If JOBU = 'C' : *> WORK(1) = CTOL, where CTOL defines the threshold for convergence. @@ -239,7 +239,12 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> length of WORK, WORK >= MAX(6,M+N) +*> Length of WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; WORK(1) is set to the minial (and optimal) +*> length of WORK. *> \endverbatim *> *> \param[out] INFO @@ -260,7 +265,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realGEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -351,9 +356,9 @@ INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, - $ SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + $ SWBAND, MINMN, LWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. Local Arrays .. REAL FASTR( 5 ) @@ -369,8 +374,8 @@ INTEGER ISAMAX EXTERNAL ISAMAX * from LAPACK - REAL SLAMCH - EXTERNAL SLAMCH + REAL SLAMCH, SROUNDUP_LWORK + EXTERNAL SLAMCH, SROUNDUP_LWORK LOGICAL LSAME EXTERNAL LSAME * .. @@ -394,6 +399,14 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 6, M+N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -413,7 +426,7 @@ INFO = -11 ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 ELSE INFO = 0 @@ -423,11 +436,14 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGESVJ', -INFO ) RETURN + ELSE IF( LQUERY ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f index fe71bc4a5..7b06bb63d 100644 --- a/lapack-netlib/SRC/sgetri.f +++ b/lapack-netlib/SRC/sgetri.f @@ -137,8 +137,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA @@ -152,8 +153,9 @@ * INFO = 0 NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, N*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -251,7 +253,7 @@ $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) 60 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(IWS) + WORK( 1 ) = SROUNDUP_LWORK( IWS ) RETURN * * End of SGETRI diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index d89c6a4e6..08a427a8b 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -226,7 +226,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f index d80ff4da8..7ade8a66c 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.f +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -130,14 +130,17 @@ *> *> \param[in] LWORK *> \verbatim +*> LWORK is INTEGER *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -216,7 +219,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -229,7 +232,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -267,8 +270,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * @@ -350,4 +354,4 @@ * * End of SGETSQRHRT * - END \ No newline at end of file + END diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f index e35d4955a..e90cd6947 100644 --- a/lapack-netlib/SRC/sgges3.f +++ b/lapack-netlib/SRC/sgges3.f @@ -234,6 +234,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK >= 6*N+16. +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -309,7 +311,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, LST2SL, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT + $ ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SAFMAX, SAFMIN, SMLNUM * .. @@ -361,6 +364,12 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + IF( N.EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 6*N+16 + END IF +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -377,7 +386,7 @@ INFO = -15 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -17 - ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -19 END IF * @@ -385,7 +394,7 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) @@ -407,7 +416,11 @@ $ IERR ) LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -421,6 +434,7 @@ * IF( N.EQ.0 ) THEN SDIM = 0 + WORK( 1 ) = 1 RETURN END IF * @@ -657,7 +671,7 @@ * 40 CONTINUE * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f index c82d2187f..d788d1147 100644 --- a/lapack-netlib/SRC/sggev3.f +++ b/lapack-netlib/SRC/sggev3.f @@ -189,6 +189,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. LWORK >= MAX(1,8*N). +*> For good performance, LWORK should generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -248,7 +250,8 @@ LOGICAL ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, - $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT + $ IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT, + $ LWKMIN REAL ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP * .. @@ -298,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 8*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -312,7 +316,7 @@ INFO = -12 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -14 - ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -16 END IF * @@ -320,28 +324,31 @@ * IF( INFO.EQ.0 ) THEN CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) ) CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL, $ VR, LDVR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) IF( ILVL ) THEN CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) ) CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) ELSE CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, $ ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR, $ WORK, -1, 0, IERR ) - LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF - WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) -* END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f index 9c5858b5a..01e57088a 100644 --- a/lapack-netlib/SRC/sgghd3.f +++ b/lapack-netlib/SRC/sgghd3.f @@ -179,14 +179,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> -*> \param[in] LWORK +*> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -276,7 +276,12 @@ * INFO = 0 NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -326,7 +331,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = ONE RETURN @@ -886,6 +890,7 @@ IF ( JCOL.LT.IHI ) $ CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index ebb42a899..d32b48410 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -236,8 +236,9 @@ * .. * .. External Functions .. INTEGER ILAENV + EXTERNAL ILAENV REAL SROUNDUP_LWORK - EXTERNAL ILAENV, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN @@ -251,8 +252,9 @@ NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN INFO = -1 @@ -289,6 +291,7 @@ * CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO ) LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) ) +* WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index 2163f1ef8..b3842ec2a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P)*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f index 053fff5de..cee630593 100644 --- a/lapack-netlib/SRC/sggsvd3.f +++ b/lapack-netlib/SRC/sggsvd3.f @@ -278,7 +278,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f index a463b9064..8e90d770c 100644 --- a/lapack-netlib/SRC/sggsvp3.f +++ b/lapack-netlib/SRC/sggsvp3.f @@ -227,7 +227,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -300,8 +300,9 @@ * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT, diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f index d4996b1f2..432afaded 100644 --- a/lapack-netlib/SRC/slamswlq.f +++ b/lapack-netlib/SRC/slamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,33 +192,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL STPMLQT, SGEMLQT, XERBLA * .. @@ -223,52 +231,60 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * INFO = 0 IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -402,7 +418,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMSWLQ diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f index 960b794de..f9b167aea 100644 --- a/lapack-netlib/SRC/slamtsqr.f +++ b/lapack-netlib/SRC/slamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,33 +193,38 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + REAL A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. External Subroutines .. EXTERNAL SGEMQRT, STPMQRT, XERBLA * .. @@ -225,12 +232,13 @@ * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'T' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +246,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +267,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN. AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +424,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLAMTSQR diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f index 685f823a0..594c646db 100644 --- a/lapack-netlib/SRC/slaswlq.f +++ b/lapack-netlib/SRC/slaswlq.f @@ -96,22 +96,24 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) -*> +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB * M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim + *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -163,32 +165,35 @@ *> * ===================================================================== SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T( LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -199,12 +204,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -212,60 +224,60 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = SROUNDUP_LWORK(M * MB) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLASWLQ diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f index 8f0c4bf16..17052289e 100644 --- a/lapack-netlib/SRC/slatrs3.f +++ b/lapack-netlib/SRC/slatrs3.f @@ -151,13 +151,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK). -*> On exit, if INFO = 0, WORK(1) returns the optimal size of -*> WORK. +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -165,6 +168,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -181,7 +185,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -253,7 +257,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -264,7 +268,8 @@ EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM * .. * .. External Subroutines .. - EXTERNAL SLATRS, SSCAL, XERBLA + REAL SROUNDUP_LWORK + EXTERNAL SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN @@ -292,15 +297,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * Test the input parameters. * @@ -322,7 +336,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN @@ -650,6 +664,8 @@ END DO END DO RETURN +* + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) * * End of SLATRS3 * diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f index 86733bb15..4730815b5 100644 --- a/lapack-netlib/SRC/slatsqr.f +++ b/lapack-netlib/SRC/slatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,39 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - REAL A( LDA, * ), WORK( * ), T(LDT, *) + REAL A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME + REAL SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL SGEQRT, STPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +207,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +221,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) * - CTR = 1 - DO I = MB+1, II-MB+N , (MB-N) + CTR = 1 + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1, CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1, CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SLATSQR diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index a5e4638d6..2ae44fc81 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -96,8 +96,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, -*> dimension (LWORK) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -251,7 +250,7 @@ $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -335,7 +334,7 @@ IF( ISCALE.EQ.1 ) $ CALL SSCAL( N, ONE / SIGMA, W, 1 ) * - WORK( 1 ) = SROUNDUP_LWORK(LOPT) + WORK( 1 ) = SROUNDUP_LWORK( LOPT ) IWORK( 1 ) = LIOPT * RETURN diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f index 47e4d7cbf..870facd60 100644 --- a/lapack-netlib/SRC/ssyevr.f +++ b/lapack-netlib/SRC/ssyevr.f @@ -271,7 +271,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,26*N). +*> The dimension of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 26*N. *> For optimal efficiency, LWORK >= (NB+6)*N, *> where NB is the max of the blocksize for SSYTRD and SORMTR *> returned by ILAENV. @@ -292,7 +293,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK and @@ -392,8 +394,13 @@ * LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) ) * - LWMIN = MAX( 1, 26*N ) - LIWMIN = MAX( 1, 10*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 26*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -428,7 +435,7 @@ NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN @@ -677,7 +684,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f index a2d6a6231..471e25977 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.f +++ b/lapack-netlib/SRC/ssyevr_2stage.f @@ -278,6 +278,7 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + 5*N @@ -300,13 +301,14 @@ *> \param[out] IWORK *> \verbatim *> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) -*> On exit, if INFO = 0, IWORK(1) returns the optimal LWORK. +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. *> \endverbatim *> *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the IWORK array, @@ -445,8 +447,14 @@ IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = MAX( 26*N, 5*N + LHTRD + LWTRD ) + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -485,7 +493,7 @@ * NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) * NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) * LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN END IF * @@ -505,7 +513,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 26 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = A( 1, 1 ) @@ -733,7 +741,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) IWORK( 1 ) = LIWMIN * RETURN diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f index 2204aa39b..aaed6dad5 100644 --- a/lapack-netlib/SRC/ssyevx.f +++ b/lapack-netlib/SRC/ssyevx.f @@ -338,14 +338,14 @@ IF( INFO.EQ.0 ) THEN IF( N.LE.1 ) THEN LWKMIN = 1 - WORK( 1 ) = SROUNDUP_LWORK(LWKMIN) + LWKOPT = 1 ELSE LWKMIN = 8*N NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( LWKMIN, ( NB + 3 )*N ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) $ INFO = -17 @@ -542,7 +542,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f index e43d4de7f..711a275e1 100644 --- a/lapack-netlib/SRC/ssysv_aa.f +++ b/lapack-netlib/SRC/ssysv_aa.f @@ -177,12 +177,13 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS + INTEGER LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL XERBLA, SSYTRS_AA, SSYTRF_AA @@ -196,6 +197,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,18 +208,18 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_SYTRF = INT( WORK(1) ) + LWKOPT_SYTRF = INT( WORK( 1 ) ) CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_SYTRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + LWKOPT_SYTRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -239,7 +241,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f index 3d88e068e..fb068b3bf 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.f +++ b/lapack-netlib/SRC/ssysv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is REAL array, dimension (LTB) +*> TB is REAL array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -204,12 +205,13 @@ * .. * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKMIN, LWKOPT * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE, @@ -226,6 +228,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -234,18 +237,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -255,7 +259,6 @@ RETURN END IF * -* * Compute the factorization A = U**T*T*U or A = L*T*L**T. * CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2, @@ -269,7 +272,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f index 0d72217eb..06a6413f1 100644 --- a/lapack-netlib/SRC/ssysvx.f +++ b/lapack-netlib/SRC/ssysvx.f @@ -305,7 +305,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKMIN, LWKOPT, NB REAL ANORM * .. * .. External Functions .. @@ -327,6 +327,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 3*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -344,12 +345,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 3*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f index 5d70ae0d4..5b401c3d0 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.f +++ b/lapack-netlib/SRC/ssytrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download SSYTRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download SSYTRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * REAL A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the orthogonal *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is REAL array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is REAL array, dimension (LHOUS2) +*> HOUS2 is REAL array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -149,17 +151,19 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +181,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +206,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +214,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +269,13 @@ * KD = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +316,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN END IF - CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRD_SB2ST', -INFO ) @@ -324,8 +331,7 @@ END IF * * - HOUS2( 1 ) = LHMIN - WORK( 1 ) = LWMIN + WORK( 1 ) = LWMIN RETURN * * End of SSYTRD_2STAGE diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f index 4efc43630..3996e07bb 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.f +++ b/lapack-netlib/SRC/ssytrd_sy2sb.f @@ -124,7 +124,7 @@ *> \param[out] WORK *> \verbatim *> WORK is REAL array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY) +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -294,8 +296,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -314,7 +320,7 @@ CALL XERBLA( 'SSYTRD_SY2SB', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN END IF * @@ -507,7 +513,7 @@ END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWMIN) + WORK( 1 ) = SROUNDUP_LWORK( LWMIN ) RETURN * * End of SSYTRD_SY2SB diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index a788fbcf0..55f3a4f0f 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -234,7 +234,7 @@ * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -353,7 +353,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f index d6408a978..af32fb064 100644 --- a/lapack-netlib/SRC/ssytrf_aa.f +++ b/lapack-netlib/SRC/ssytrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -128,7 +130,7 @@ *> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -142,19 +144,19 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), WORK( * ) + REAL A( LDA, * ), WORK( * ) * .. * * ===================================================================== * .. Parameters .. - REAL ZERO, ONE + REAL ZERO, ONE PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB - REAL ALPHA + REAL ALPHA * .. * .. External Functions .. LOGICAL LSAME @@ -180,19 +182,26 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -204,11 +213,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN RETURN END IF * @@ -458,7 +467,8 @@ END IF * 20 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_AA diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f index abe6564c5..6b5cdee1b 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.f +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f @@ -94,7 +94,7 @@ *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL workspace of size LWORK +*> WORK is REAL workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) ) END IF IF( WQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(N*NB) + WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index 72830543c..89ecf38fd 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension ( MAX(1,LWORK) ). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -312,7 +312,7 @@ * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -488,7 +488,7 @@ * END IF * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_RK diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f index 339a229e7..7c2cbbc57 100644 --- a/lapack-netlib/SRC/ssytrf_rook.f +++ b/lapack-netlib/SRC/ssytrf_rook.f @@ -118,7 +118,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -260,7 +260,7 @@ * NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( 1, N*NB ) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -383,7 +383,8 @@ END IF * 40 CONTINUE - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) +* + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) RETURN * * End of SSYTRF_ROOK diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f index 49f6cad65..fd1c53473 100644 --- a/lapack-netlib/SRC/ssytri2.f +++ b/lapack-netlib/SRC/ssytri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3) +*> WORK is REAL array, dimension (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup realSYcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -147,7 +147,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ILAENV - EXTERNAL LSAME, ILAENV + REAL SROUNDUP_LWORK + EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SSYTRI, SSYTRI2X, XERBLA @@ -159,9 +160,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +178,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = SROUNDUP_LWORK( MINSIZE ) RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - - IF( NBMAX .GE. N ) THEN +* + IF( NBMAX.GE.N ) THEN CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of SSYTRI2 diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f index bca01105d..f0152a149 100644 --- a/lapack-netlib/SRC/ssytri_3.f +++ b/lapack-netlib/SRC/ssytri_3.f @@ -119,16 +119,17 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (N+NB+1)*(NB+3). +*> WORK is REAL array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= (N+NB+1)*(NB+3). +*> The length of WORK. +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> -*> If LDWORK = -1, then a workspace query is assumed; +*> If LWORK = -1, then a workspace query is assumed; *> the routine only calculates the optimal size of the optimal *> size of the WORK array, returns this value as the first *> entry of the WORK array, and no error message related to @@ -209,8 +210,13 @@ * * Determine the block size * - NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) - LWKOPT = ( N+NB+1 ) * ( NB+3 ) + IF( N.EQ.0 ) THEN + LWKOPT = 1 + ELSE + NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) ) + LWKOPT = ( N+NB+1 ) * ( NB+3 ) + END IF + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 @@ -218,7 +224,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN INFO = -8 END IF * @@ -226,7 +232,6 @@ CALL XERBLA( 'SSYTRI_3', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) RETURN END IF * @@ -237,7 +242,7 @@ * CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO ) * - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKOPT ) * RETURN * diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f index 12fca0c71..265cf0c1d 100644 --- a/lapack-netlib/SRC/ssytrs_aa.f +++ b/lapack-netlib/SRC/ssytrs_aa.f @@ -105,7 +105,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -141,7 +147,7 @@ * .. * .. Array Arguments .. INTEGER IPIV( * ) - REAL A( LDA, * ), B( LDB, * ), WORK( * ) + REAL A( LDA, * ), B( LDB, * ), WORK( * ) * .. * * ===================================================================== @@ -151,24 +157,31 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME + EXTERNAL LSAME REAL SROUNDUP_LWORK - EXTERNAL LSAME, SROUNDUP_LWORK + EXTERNAL SROUNDUP_LWORK * .. * .. External Subroutines .. EXTERNAL SGTSV, SSWAP, SLACPY, STRSM, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +192,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SSYTRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) + WORK( 1 ) = SROUNDUP_LWORK( LWKMIN ) RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN From 45ef0d7361f05d96828534264f0536a492b14180 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 20:16:33 +0100 Subject: [PATCH 504/718] Handle corner cases of LWORK (Reference-LAPACK PR 942) --- lapack-netlib/SRC/zgebrd.f | 26 ++++-- lapack-netlib/SRC/zgehrd.f | 23 +++-- lapack-netlib/SRC/zgelq.f | 4 +- lapack-netlib/SRC/zgelqf.f | 20 ++-- lapack-netlib/SRC/zgemlq.f | 28 ++++-- lapack-netlib/SRC/zgemqr.f | 30 ++++-- lapack-netlib/SRC/zgeqlf.f | 10 +- lapack-netlib/SRC/zgeqp3rk.f | 3 +- lapack-netlib/SRC/zgeqr.f | 20 ++-- lapack-netlib/SRC/zgeqrfp.f | 24 +++-- lapack-netlib/SRC/zgesvj.f | 83 ++++++++++------- lapack-netlib/SRC/zgetri.f | 4 +- lapack-netlib/SRC/zgetsls.f | 12 ++- lapack-netlib/SRC/zgetsqrhrt.f | 15 +-- lapack-netlib/SRC/zgges3.f | 34 ++++--- lapack-netlib/SRC/zggev3.f | 24 +++-- lapack-netlib/SRC/zgghd3.f | 15 ++- lapack-netlib/SRC/zggqrf.f | 4 +- lapack-netlib/SRC/zggrqf.f | 4 +- lapack-netlib/SRC/zggsvd3.f | 4 +- lapack-netlib/SRC/zggsvp3.f | 4 +- lapack-netlib/SRC/zheevd.f | 5 +- lapack-netlib/SRC/zheevr.f | 29 ++++-- lapack-netlib/SRC/zheevr_2stage.f | 42 +++++---- lapack-netlib/SRC/zhesv_aa.f | 15 +-- lapack-netlib/SRC/zhesv_aa_2stage.f | 23 +++-- lapack-netlib/SRC/zhesvx.f | 13 +-- lapack-netlib/SRC/zhetrd_2stage.f | 87 +++++++++-------- lapack-netlib/SRC/zhetrd_hb2st.F | 134 ++++++++++++++------------- lapack-netlib/SRC/zhetrd_he2hb.f | 18 ++-- lapack-netlib/SRC/zhetrf.f | 7 +- lapack-netlib/SRC/zhetrf_aa.f | 27 ++++-- lapack-netlib/SRC/zhetrf_aa_2stage.f | 28 +++--- lapack-netlib/SRC/zhetrf_rk.f | 8 +- lapack-netlib/SRC/zhetrf_rook.f | 4 +- lapack-netlib/SRC/zhetri2.f | 27 +++--- lapack-netlib/SRC/zhetrs_aa.f | 27 ++++-- lapack-netlib/SRC/zlamswlq.f | 68 ++++++++------ lapack-netlib/SRC/zlamtsqr.f | 76 ++++++++------- lapack-netlib/SRC/zlaswlq.f | 91 ++++++++++-------- lapack-netlib/SRC/zlatrs3.f | 22 ++++- lapack-netlib/SRC/zlatsqr.f | 95 +++++++++++-------- 42 files changed, 733 insertions(+), 504 deletions(-) diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f index f1791c6a4..c1a6169a7 100644 --- a/lapack-netlib/SRC/zgebrd.f +++ b/lapack-netlib/SRC/zgebrd.f @@ -122,7 +122,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,M,N). +*> The length of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise. *> For optimum performance LWORK >= (M+N)*NB, where NB *> is the optimal blocksize. *> @@ -147,7 +148,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gebrd * *> \par Further Details: * ===================== @@ -223,8 +224,8 @@ * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB, - $ NBMIN, NX, WS + INTEGER I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT, + $ MINMN, NB, NBMIN, NX, WS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEBD2, ZGEMM, ZLABRD @@ -241,9 +242,17 @@ * Test the input parameters * INFO = 0 - NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) - LWKOPT = ( M+N )*NB + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = MAX( M, N ) + NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) ) + LWKOPT = ( M+N )*NB + END IF WORK( 1 ) = DBLE( LWKOPT ) +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -251,7 +260,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.LT.0 ) THEN @@ -263,7 +272,6 @@ * * Quick return if possible * - MINMN = MIN( M, N ) IF( MINMN.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -282,7 +290,7 @@ * Determine when to switch from blocked to unblocked code. * IF( NX.LT.MINMN ) THEN - WS = ( M+N )*NB + WS = LWKOPT IF( LWORK.LT.WS ) THEN * * Not enough work space for the optimal NB, consider using diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f index e18493cf9..0f4424ded 100644 --- a/lapack-netlib/SRC/zgehrd.f +++ b/lapack-netlib/SRC/zgehrd.f @@ -89,7 +89,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gehrd * *> \par Further Details: * ===================== @@ -173,7 +173,7 @@ INTEGER IHI, ILO, INFO, LDA, LWORK, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) + COMPLEX*16 A( LDA, * ), TAU( * ), WORK( * ) * .. * * ===================================================================== @@ -182,7 +182,7 @@ INTEGER NBMAX, LDT, TSIZE PARAMETER ( NBMAX = 64, LDT = NBMAX+1, $ TSIZE = LDT*NBMAX ) - COMPLEX*16 ZERO, ONE + COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. @@ -190,7 +190,7 @@ LOGICAL LQUERY INTEGER I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB, $ NBMIN, NH, NX - COMPLEX*16 EI + COMPLEX*16 EI * .. * .. External Subroutines .. EXTERNAL ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM, @@ -221,12 +221,18 @@ INFO = -8 END IF * + NH = IHI - ILO + 1 IF( INFO.EQ.0 ) THEN * * Compute the workspace requirements * - NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) ) - LWKOPT = N*NB + TSIZE + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, + $ -1 ) ) + LWKOPT = N*NB + TSIZE + END IF WORK( 1 ) = LWKOPT ENDIF * @@ -248,7 +254,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = 1 RETURN @@ -268,7 +273,7 @@ * * Determine if workspace is large enough for blocked code * - IF( LWORK.LT.N*NB+TSIZE ) THEN + IF( LWORK.LT.LWKOPT ) THEN * * Not enough workspace to use optimal NB: determine the * minimum value of NB, and reduce NB or force use of diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index de7c9a378..86610e801 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -98,7 +98,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -166,6 +166,8 @@ *> the LQ factorization. *> \endverbatim *> +*> \ingroup gelq +*> * ===================================================================== SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f index 6c295eece..e988ea818 100644 --- a/lapack-netlib/SRC/zgelqf.f +++ b/lapack-netlib/SRC/zgelqf.f @@ -93,7 +93,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,M). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise. *> For optimum performance LWORK >= M*NB, where NB is the *> optimal blocksize. *> @@ -118,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gelqf * *> \par Further Details: * ===================== @@ -174,9 +175,8 @@ * Test the input arguments * INFO = 0 + K = MIN( M, N ) NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 ) - LWKOPT = M*NB - WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -184,19 +184,25 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGELQF', -INFO ) RETURN ELSE IF( LQUERY ) THEN + IF( K.EQ.0 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = M*NB + END IF + WORK( 1 ) = LWKOPT RETURN END IF * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f index 41cd1c059..11489087a 100644 --- a/lapack-netlib/SRC/zgemlq.f +++ b/lapack-netlib/SRC/zgemlq.f @@ -109,16 +109,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -142,7 +143,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -158,11 +159,13 @@ *> block sizes MB and NB returned by ILAENV, ZGELQ will use either *> ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute *> the LQ factorization. -*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to +*> This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to *> multiply matrix Q by another matrix. *> Further Details in ZLAMSWLQ or ZGEMLQT. *> \endverbatim *> +*> \ingroup gemlq +*> * ===================================================================== SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -184,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -200,7 +203,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -215,6 +218,13 @@ LW = M * MB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN @@ -243,7 +253,7 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 END IF * @@ -260,7 +270,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f index c83eaff2f..d14d74fe2 100644 --- a/lapack-netlib/SRC/zgemqr.f +++ b/lapack-netlib/SRC/zgemqr.f @@ -111,16 +111,17 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1, then a workspace query is assumed. The routine *> only calculates the size of the WORK array, returns this -*> value as WORK(1), and no error message related to WORK +*> value as WORK(1), and no error message related to WORK *> is issued by XERBLA. *> \endverbatim *> @@ -144,7 +145,7 @@ *> *> \verbatim *> -*> These details are particular for this LAPACK implementation. Users should not +*> These details are particular for this LAPACK implementation. Users should not *> take them for granted. These details may change in the future, and are not likely *> true for another LAPACK implementation. These details are relevant if one wants *> to try to understand the code. They are not part of the interface. @@ -166,6 +167,8 @@ *> *> \endverbatim *> +*> \ingroup gemqr +*> * ===================================================================== SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE, $ C, LDC, WORK, LWORK, INFO ) @@ -187,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER MB, NB, LW, NBLCKS, MN + INTEGER MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME @@ -203,7 +206,7 @@ * * Test the input arguments * - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) @@ -218,6 +221,13 @@ LW = MB * NB MN = N END IF +* + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF * IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN IF( MOD( MN - K, MB - K ).EQ.0 ) THEN @@ -246,12 +256,12 @@ INFO = -9 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN INFO = -11 - ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN - WORK( 1 ) = LW + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN @@ -263,7 +273,7 @@ * * Quick return if possible * - IF( MIN( M, N, K ).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * @@ -276,7 +286,7 @@ $ NB, C, LDC, WORK, LWORK, INFO ) END IF * - WORK( 1 ) = LW + WORK( 1 ) = LWMIN * RETURN * diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f index 94721540c..a27612c64 100644 --- a/lapack-netlib/SRC/zgeqlf.f +++ b/lapack-netlib/SRC/zgeqlf.f @@ -88,7 +88,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -113,7 +114,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqlf * *> \par Further Details: * ===================== @@ -188,8 +189,9 @@ END IF WORK( 1 ) = LWKOPT * - IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + IF( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF END IF * diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f index 247a3c379..01dcce0de 100644 --- a/lapack-netlib/SRC/zgeqp3rk.f +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -428,7 +428,8 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*. LWORK >= N+NRHS-1 +*> LWORK >= 1, if MIN(M,N) = 0, and +*> LWORK >= N+NRHS-1, otherwise. *> For optimal performance LWORK >= NB*( N+NRHS+1 ), *> where NB is the optimal block size for ZGEQP3RK returned *> by ILAENV. Minimal block size MINNB=2. diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f index 20a80d083..7df9c2403 100644 --- a/lapack-netlib/SRC/zgeqr.f +++ b/lapack-netlib/SRC/zgeqr.f @@ -99,7 +99,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. The routine *> only calculates the sizes of the T and WORK arrays, returns these *> values as the first entries of the T and WORK arrays, and no error @@ -168,6 +168,8 @@ *> *> \endverbatim *> +*> \ingroup geqr +*> * ===================================================================== SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ INFO ) @@ -188,7 +190,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -244,8 +246,10 @@ * * Determine if the workspace size satisfies minimal size * + LWMIN = MAX( 1, N ) + LWREQ = MAX( 1, N*NB ) LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N ) + IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ ) $ .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN @@ -253,7 +257,7 @@ NB = 1 MB = M END IF - IF( LWORK.LT.NB*N ) THEN + IF( LWORK.LT.LWREQ ) THEN LMINWS = .TRUE. NB = 1 END IF @@ -268,7 +272,7 @@ ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -282,9 +286,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -309,7 +313,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, NB*N ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f index 73bcde667..3562de36e 100644 --- a/lapack-netlib/SRC/zgeqrfp.f +++ b/lapack-netlib/SRC/zgeqrfp.f @@ -97,7 +97,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,N). +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise. *> For optimum performance LWORK >= N*NB, where NB is *> the optimal blocksize. *> @@ -122,7 +123,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup geqrfp * *> \par Further Details: * ===================== @@ -162,8 +163,8 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB, - $ NBMIN, NX + INTEGER I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. EXTERNAL XERBLA, ZGEQR2P, ZLARFB, ZLARFT @@ -181,8 +182,16 @@ * INFO = 0 NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 ) - LWKOPT = N*NB + K = MIN( M, N ) + IF( K.EQ.0 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = N + LWKOPT = N*NB + END IF WORK( 1 ) = LWKOPT +* LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 @@ -190,7 +199,7 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF IF( INFO.NE.0 ) THEN @@ -202,7 +211,6 @@ * * Quick return if possible * - K = MIN( M, N ) IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN @@ -210,7 +218,7 @@ * NBMIN = 2 NX = 0 - IWS = N + IWS = LWKMIN IF( NB.GT.1 .AND. NB.LT.K ) THEN * * Determine when to cross over from blocked to unblocked code. diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f index 6cd2335f2..2be45d826 100644 --- a/lapack-netlib/SRC/zgesvj.f +++ b/lapack-netlib/SRC/zgesvj.f @@ -200,23 +200,25 @@ *> \verbatim *> LDV is INTEGER *> The leading dimension of the array V, LDV >= 1. -*> If JOBV = 'V', then LDV >= max(1,N). -*> If JOBV = 'A', then LDV >= max(1,MV) . +*> If JOBV = 'V', then LDV >= MAX(1,N). +*> If JOBV = 'A', then LDV >= MAX(1,MV) . *> \endverbatim *> *> \param[in,out] CWORK *> \verbatim -*> CWORK is COMPLEX*16 array, dimension (max(1,LWORK)) +*> CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> Used as workspace. -*> If on entry LWORK = -1, then a workspace query is assumed and -*> no computation is done; CWORK(1) is set to the minial (and optimal) -*> length of CWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER. -*> Length of CWORK, LWORK >= M+N. +*> Length of CWORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise. +*> +*> If on entry LWORK = -1, then a workspace query is assumed and +*> no computation is done; CWORK(1) is set to the minial (and optimal) +*> length of CWORK. *> \endverbatim *> *> \param[in,out] RWORK @@ -247,15 +249,17 @@ *> RWORK(6) = the largest absolute value over all sines of the *> Jacobi rotation angles in the last sweep. It can be *> useful for a post festum analysis. -*> If on entry LRWORK = -1, then a workspace query is assumed and -*> no computation is done; RWORK(1) is set to the minial (and optimal) -*> length of RWORK. *> \endverbatim *> *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> Length of RWORK, LRWORK >= MAX(6,N). +*> Length of RWORK. +*> LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise. +*> +*> If on entry LRWORK = -1, then a workspace query is assumed and +*> no computation is done; RWORK(1) is set to the minial (and optimal) +*> length of RWORK. *> \endverbatim *> *> \param[out] INFO @@ -276,7 +280,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup gesvj * *> \par Further Details: * ===================== @@ -367,23 +371,25 @@ * * .. Local Parameters .. DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) - COMPLEX*16 CZERO, CONE - PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) - INTEGER NSWEEP - PARAMETER ( NSWEEP = 30 ) + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0) + COMPLEX*16 CZERO, CONE + PARAMETER ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) ) + INTEGER NSWEEP + PARAMETER ( NSWEEP = 30 ) * .. * .. Local Scalars .. - COMPLEX*16 AAPQ, OMPQ - DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, - $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, - $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, - $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL - INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, - $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, - $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND - LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK, - $ RSVEC, UCTOL, UPPER + COMPLEX*16 AAPQ, OMPQ + DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG, + $ BIGTHETA, CS, CTOL, EPSLN, MXAAPQ, + $ MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL, + $ SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, + $ TOL + INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1, + $ ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34, + $ N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, + $ SWBAND, MINMN, LWMIN, LRWMIN + LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, + $ ROTOK, RSVEC, UCTOL, UPPER * .. * .. * .. Intrinsic Functions .. @@ -422,7 +428,16 @@ UPPER = LSAME( JOBA, 'U' ) LOWER = LSAME( JOBA, 'L' ) * - LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 ) + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + LRWMIN = 1 + ELSE + LWMIN = M+N + LRWMIN = MAX( 6, N ) + END IF +* + LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN INFO = -1 ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN @@ -442,9 +457,9 @@ INFO = -11 ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN INFO = -12 - ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -13 - ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN + ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN INFO = -15 ELSE INFO = 0 @@ -454,15 +469,15 @@ IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGESVJ', -INFO ) RETURN - ELSE IF ( LQUERY ) THEN - CWORK(1) = M + N - RWORK(1) = MAX( N, 6 ) + ELSE IF( LQUERY ) THEN + CWORK( 1 ) = LWMIN + RWORK( 1 ) = LRWMIN RETURN END IF * * #:) Quick return for void matrix * - IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN + IF( MINMN.EQ.0 ) RETURN * * Set numerical parameters * The stopping criterion for Jacobi rotations is diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f index 41782841c..f3806a77c 100644 --- a/lapack-netlib/SRC/zgetri.f +++ b/lapack-netlib/SRC/zgetri.f @@ -107,7 +107,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEcomputational +*> \ingroup getri * * ===================================================================== SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -152,7 +152,7 @@ * INFO = 0 NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 17c6d5146..26311c611 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -127,7 +127,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> If LWORK = -1 or -2, then a workspace query is assumed. *> If LWORK = -1, the routine calculates optimal size of WORK for the *> optimal performance and returns this value in WORK(1). @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsolve +*> \ingroup getsls * * ===================================================================== SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, @@ -192,7 +192,7 @@ * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLABAD, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE * .. * .. External Subroutines .. EXTERNAL ZGEQR, ZGEMQR, ZLASCL, ZLASET, @@ -229,7 +229,10 @@ * * Determine the optimum and minimum LWORK * - IF( M.GE.N ) THEN + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + WSIZEO = 1 + WSIZEM = 1 + ELSE IF( M.GE.N ) THEN CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 ) TSZO = INT( TQ( 1 ) ) LWO = INT( WORKQ( 1 ) ) @@ -297,7 +300,6 @@ * SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) * * Scale A, B if max element outside range [SMLNUM,BIGNUM] * diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f index 5f0167937..e7ce993aa 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.f +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -131,13 +131,15 @@ *> \param[in] LWORK *> \verbatim *> The dimension of the array WORK. -*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> If MIN(M,N) = 0, LWORK >= 1, else +*> LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), *> where *> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), *> NB1LOCAL = MIN(NB1,N). *> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, *> LW1 = NB1LOCAL * N, -*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ). +*> *> If LWORK = -1, then a workspace query is assumed. *> The routine only calculates the optimal size of the WORK *> array, returns this value as the first entry of the WORK @@ -160,7 +162,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup comlpex16OTHERcomputational +*> \ingroup getsqrhrt * *> \par Contributors: * ================== @@ -212,7 +214,7 @@ * Test the input arguments * INFO = 0 - LQUERY = LWORK.EQ.-1 + LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. M.LT.N ) THEN @@ -225,7 +227,7 @@ INFO = -5 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -7 - ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN INFO = -9 ELSE * @@ -263,8 +265,9 @@ LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) * LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) + LWORKOPT = MAX( 1, LWORKOPT ) * - IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN INFO = -11 END IF * diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f index 8b3e44f88..8235c2543 100644 --- a/lapack-netlib/SRC/zgges3.f +++ b/lapack-netlib/SRC/zgges3.f @@ -215,7 +215,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N) +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -260,7 +261,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup gges3 * * ===================================================================== SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, @@ -300,7 +301,8 @@ LOGICAL CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL, $ LQUERY, WANTST INTEGER I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, - $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT + $ ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT, + $ LWKMIN DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL, $ PVSR, SMLNUM * .. @@ -309,9 +311,8 @@ DOUBLE PRECISION DIF( 2 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -353,6 +354,8 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) +* IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -369,7 +372,7 @@ INFO = -14 ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN INFO = -16 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * @@ -377,28 +380,32 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) IF( ILVSL ) THEN CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR ) LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) END IF CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL, $ LDVSL, VSR, LDVSR, WORK, -1, IERR ) - LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) ) CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1, $ RWORK, 0, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) IF( WANTST ) THEN CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB, $ ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM, $ PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR ) - LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) ) + LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) ) + END IF + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) END IF * IF( INFO.NE.0 ) THEN @@ -420,7 +427,6 @@ EPS = DLAMCH( 'P' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f index 2d6c74582..0cc073470 100644 --- a/lapack-netlib/SRC/zggev3.f +++ b/lapack-netlib/SRC/zggev3.f @@ -174,7 +174,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= MAX(1,2*N). +*> For good performance, LWORK must generally be larger. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -208,7 +209,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEeigen +*> \ingroup ggev3 * * ===================================================================== SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA, @@ -243,7 +244,7 @@ CHARACTER CHTEMP INTEGER ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO, $ IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR, - $ LWKOPT + $ LWKMIN, LWKOPT DOUBLE PRECISION ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, $ SMLNUM, TEMP COMPLEX*16 X @@ -252,9 +253,8 @@ LOGICAL LDUMMA( 1 ) * .. * .. External Subroutines .. - EXTERNAL DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, - $ ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, - $ ZUNMQR + EXTERNAL XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0, + $ ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR * .. * .. External Functions .. LOGICAL LSAME @@ -301,6 +301,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( IJOBVL.LE.0 ) THEN INFO = -1 ELSE IF( IJOBVR.LE.0 ) THEN @@ -315,7 +316,7 @@ INFO = -11 ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -15 END IF * @@ -323,7 +324,7 @@ * IF( INFO.EQ.0 ) THEN CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR ) - LWKOPT = MAX( 1, N+INT( WORK( 1 ) ) ) + LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) ) CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK, $ -1, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) @@ -348,7 +349,11 @@ $ RWORK, 0, IERR ) LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) ) END IF - WORK( 1 ) = DCMPLX( LWKOPT ) + IF( N.EQ.0 ) THEN + WORK( 1 ) = 1 + ELSE + WORK( 1 ) = DCMPLX( LWKOPT ) + END IF END IF * IF( INFO.NE.0 ) THEN @@ -368,7 +373,6 @@ EPS = DLAMCH( 'E' )*DLAMCH( 'B' ) SMLNUM = DLAMCH( 'S' ) BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) SMLNUM = SQRT( SMLNUM ) / EPS BIGNUM = ONE / SMLNUM * diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f index b29cdc70a..08343688d 100644 --- a/lapack-netlib/SRC/zgghd3.f +++ b/lapack-netlib/SRC/zgghd3.f @@ -176,14 +176,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= 1. +*> The length of the array WORK. LWORK >= 1. *> For optimum performance LWORK >= 6*N*NB, where NB is the *> optimal blocksize. *> @@ -208,7 +208,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup gghd3 * *> \par Further Details: * ===================== @@ -275,7 +275,12 @@ * INFO = 0 NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 ) - LWKOPT = MAX( 6*N*NB, 1 ) + NH = IHI - ILO + 1 + IF( NH.LE.1 ) THEN + LWKOPT = 1 + ELSE + LWKOPT = 6*N*NB + END IF WORK( 1 ) = DCMPLX( LWKOPT ) INITQ = LSAME( COMPQ, 'I' ) WANTQ = INITQ .OR. LSAME( COMPQ, 'V' ) @@ -325,7 +330,6 @@ * * Quick return if possible * - NH = IHI - ILO + 1 IF( NH.LE.1 ) THEN WORK( 1 ) = CONE RETURN @@ -883,6 +887,7 @@ IF ( JCOL.LT.IHI ) $ CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q, $ LDQ, Z, LDZ, IERR ) +* WORK( 1 ) = DCMPLX( LWKOPT ) * RETURN diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 0388b0874..d8636d663 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -173,7 +173,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggqrf * *> \par Further Details: * ===================== @@ -250,7 +250,7 @@ NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index be912c772..69c14af24 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -172,7 +172,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggrqf * *> \par Further Details: * ===================== @@ -249,7 +249,7 @@ NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 ) NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 ) NB = MAX( NB1, NB2, NB3 ) - LWKOPT = MAX( N, M, P )*NB + LWKOPT = MAX( 1, MAX( N, M, P )*NB ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f index 71257a7c0..40624f5be 100644 --- a/lapack-netlib/SRC/zggsvd3.f +++ b/lapack-netlib/SRC/zggsvd3.f @@ -277,7 +277,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -332,7 +332,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16GEsing +*> \ingroup ggsvd3 * *> \par Contributors: * ================== diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f index f39ccdad3..7b465aaee 100644 --- a/lapack-netlib/SRC/zggsvp3.f +++ b/lapack-netlib/SRC/zggsvp3.f @@ -233,7 +233,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. LWORK >= 1. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -256,7 +256,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup ggsvp3 * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index ba52f9e72..8e86b9e88 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -116,8 +116,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, -*> dimension (LRWORK) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK)) *> On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK. *> \endverbatim *> @@ -180,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevd * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f index 1452e04a3..fe6e1a85f 100644 --- a/lapack-netlib/SRC/zheevr.f +++ b/lapack-netlib/SRC/zheevr.f @@ -272,7 +272,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of the array WORK. LWORK >= max(1,2*N). +*> The length of the array WORK. +*> If N <= 1, LWORK >= 1, else LWORK >= 2*N. *> For optimal efficiency, LWORK >= (NB+1)*N, *> where NB is the max of the blocksize for ZHETRD and for *> ZUNMTR as returned by ILAENV. @@ -294,7 +295,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -313,7 +315,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -338,7 +341,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr * *> \par Contributors: * ================== @@ -417,9 +420,15 @@ LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR. $ ( LIWORK.EQ.-1 ) ) * - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) - LWMIN = MAX( 1, 2*N ) + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = 2*N + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN @@ -454,7 +463,7 @@ NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) ) LWKOPT = MAX( ( NB+1 )*N, LWMIN ) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * @@ -483,7 +492,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -710,7 +719,7 @@ * * Set WORK(1) to optimal workspace size. * - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKOPT RWORK( 1 ) = LRWMIN IWORK( 1 ) = LIWMIN * diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f index 5c576e633..b1cc7175f 100644 --- a/lapack-netlib/SRC/zheevr_2stage.f +++ b/lapack-netlib/SRC/zheevr_2stage.f @@ -265,7 +265,7 @@ *> indicating the nonzero elements in Z. The i-th eigenvector *> is nonzero only in elements ISUPPZ( 2*i-1 ) through *> ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal -*> matrix). The support of the eigenvectors of A is typically +*> matrix). The support of the eigenvectors of A is typically *> 1:N because of the unitary transformations applied by ZUNMTR. *> Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 *> \endverbatim @@ -279,12 +279,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. +*> The dimension of the array WORK. +*> If N <= 1, LWORK must be at least 1. *> If JOBZ = 'N' and N > 1, LWORK must be queried. *> LWORK = MAX(1, 26*N, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N + N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) *> + (KD+1)*N + N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ @@ -310,7 +311,8 @@ *> \param[in] LRWORK *> \verbatim *> LRWORK is INTEGER -*> The length of the array RWORK. LRWORK >= max(1,24*N). +*> The length of the array RWORK. +*> If N <= 1, LRWORK >= 1, else LRWORK >= 24*N. *> *> If LRWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -329,7 +331,8 @@ *> \param[in] LIWORK *> \verbatim *> LIWORK is INTEGER -*> The dimension of the array IWORK. LIWORK >= max(1,10*N). +*> The dimension of the array IWORK. +*> If N <= 1, LIWORK >= 1, else LIWORK >= 10*N. *> *> If LIWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal sizes of the WORK, RWORK @@ -354,7 +357,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEeigen +*> \ingroup heevr_2stage * *> \par Contributors: * ================== @@ -382,7 +385,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -390,11 +393,11 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim * @@ -472,9 +475,16 @@ IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 ) LHTRD = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) LWTRD = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 ) - LWMIN = N + LHTRD + LWTRD - LRWMIN = MAX( 1, 24*N ) - LIWMIN = MAX( 1, 10*N ) +* + IF( N.LE.1 ) THEN + LWMIN = 1 + LRWMIN = 1 + LIWMIN = 1 + ELSE + LWMIN = N + LHTRD + LWTRD + LRWMIN = 24*N + LIWMIN = 10*N + END IF * INFO = 0 IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN @@ -535,7 +545,7 @@ END IF * IF( N.EQ.1 ) THEN - WORK( 1 ) = 2 + WORK( 1 ) = 1 IF( ALLEIG .OR. INDEIG ) THEN M = 1 W( 1 ) = DBLE( A( 1, 1 ) ) @@ -643,9 +653,9 @@ * * Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form. * - CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), + CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), $ RWORK( INDRE ), WORK( INDTAU ), - $ WORK( INDHOUS ), LHTRD, + $ WORK( INDHOUS ), LHTRD, $ WORK( INDWK ), LLWORK, IINFO ) * * If all eigenvalues are desired diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f index df8498c7a..b3d4b3725 100644 --- a/lapack-netlib/SRC/zhesv_aa.f +++ b/lapack-netlib/SRC/zhesv_aa.f @@ -128,7 +128,7 @@ *> LWORK is INTEGER *> The length of WORK. LWORK >= MAX(1,2*N,3*N-2), and for best *> performance LWORK >= max(1,N*NB), where NB is the optimal -*> blocksize for ZHETRF. +*> blocksize for ZHETRF_AA. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -154,7 +154,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa * * ===================================================================== SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, @@ -177,7 +177,7 @@ * * .. Local Scalars .. LOGICAL LQUERY - INTEGER LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS + INTEGER LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS * .. * .. External Functions .. LOGICAL LSAME @@ -196,6 +196,7 @@ * INFO = 0 LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N, 3*N-2 ) IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -206,17 +207,17 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT_HETRF = INT( WORK(1) ) + LWKOPT_HETRF = INT( WORK( 1 ) ) CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, $ -1, INFO ) - LWKOPT_HETRS = INT( WORK(1) ) - LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS ) + LWKOPT_HETRS = INT( WORK( 1 ) ) + LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f index 79c01c546..c503b5554 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.f +++ b/lapack-netlib/SRC/zhesv_aa_2stage.f @@ -100,14 +100,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)). *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -147,14 +147,15 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -178,7 +179,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesv_aa_2stage * * ===================================================================== SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB, @@ -208,7 +209,7 @@ * * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY - INTEGER LWKOPT + INTEGER LWKOPT, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -229,6 +230,7 @@ UPPER = LSAME( UPLO, 'U' ) WQUERY = ( LWORK.EQ.-1 ) TQUERY = ( LTB.EQ.-1 ) + LWKMIN = MAX( 1, N ) IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -237,18 +239,19 @@ INFO = -3 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -5 - ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -7 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -11 - ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN INFO = -13 END IF * IF( INFO.EQ.0 ) THEN CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV, $ IPIV2, WORK, -1, INFO ) - LWKOPT = INT( WORK(1) ) + LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) ) + WORK( 1 ) = LWKOPT END IF * IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f index 485c81df6..64aa16674 100644 --- a/lapack-netlib/SRC/zhesvx.f +++ b/lapack-netlib/SRC/zhesvx.f @@ -234,8 +234,8 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= max(1,2*N), and for best -*> performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where +*> The length of WORK. LWORK >= MAX(1,2*N), and for best +*> performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where *> NB is the optimal blocksize for ZHETRF. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -276,7 +276,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEsolve +*> \ingroup hesvx * * ===================================================================== SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B, @@ -307,7 +307,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, NOFACT - INTEGER LWKOPT, NB + INTEGER LWKOPT, LWKMIN, NB DOUBLE PRECISION ANORM * .. * .. External Functions .. @@ -329,6 +329,7 @@ INFO = 0 NOFACT = LSAME( FACT, 'N' ) LQUERY = ( LWORK.EQ.-1 ) + LWKMIN = MAX( 1, 2*N ) IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN INFO = -1 ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) @@ -346,12 +347,12 @@ INFO = -11 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -13 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -18 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = MAX( 1, 2*N ) + LWKOPT = LWKMIN IF( NOFACT ) THEN NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) LWKOPT = MAX( LWKOPT, N*NB ) diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f index b9d2f0eb1..ab444894b 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.f +++ b/lapack-netlib/SRC/zhetrd_2stage.f @@ -4,23 +4,23 @@ * * =========== DOCUMENTATION =========== * -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ * *> \htmlonly -*> Download ZHETRD_2STAGE + dependencies -*> -*> [TGZ] -*> -*> [ZIP] -*> +*> Download ZHETRD_2STAGE + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> *> [TXT] -*> \endhtmlonly +*> \endhtmlonly * * Definition: * =========== * -* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, +* SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, * HOUS2, LHOUS2, WORK, LWORK, INFO ) * * IMPLICIT NONE @@ -34,7 +34,7 @@ * COMPLEX*16 A( LDA, * ), TAU( * ), * HOUS2( * ), WORK( * ) * .. -* +* * *> \par Purpose: * ============= @@ -52,11 +52,11 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> in particular for the second stage (Band to *> tridiagonal) and thus LHOUS2 is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate Q1 Q2 or to apply Q1 Q2, +*> = 'V': the Householder representation is needed to +*> either generate Q1 Q2 or to apply Q1 Q2, *> then LHOUS2 is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -86,7 +86,7 @@ *> triangular part of A is not referenced. *> On exit, if UPLO = 'U', the band superdiagonal *> of A are overwritten by the corresponding elements of the -*> internal band-diagonal matrix AB, and the elements above +*> internal band-diagonal matrix AB, and the elements above *> the KD superdiagonal, with the array TAU, represent the unitary *> matrix Q1 as a product of elementary reflectors; if UPLO *> = 'L', the diagonal and band subdiagonal of A are over- @@ -117,13 +117,13 @@ *> \param[out] TAU *> \verbatim *> TAU is COMPLEX*16 array, dimension (N-KD) -*> The scalar factors of the elementary reflectors of +*> The scalar factors of the elementary reflectors of *> the first stage (see Further Details). *> \endverbatim *> *> \param[out] HOUS2 *> \verbatim -*> HOUS2 is COMPLEX*16 array, dimension (LHOUS2) +*> HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2)) *> Stores the Householder representation of the stage2 *> band to tridiagonal. *> \endverbatim @@ -132,6 +132,8 @@ *> \verbatim *> LHOUS2 is INTEGER *> The dimension of the array HOUS2. +*> LHOUS2 >= 1. +*> *> If LWORK = -1, or LHOUS2 = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS2 array, returns @@ -143,23 +145,26 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS2=-1, +*> The dimension of the array WORK. +*> If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS2 = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> LWORK = MAX(1, dimension) where *> dimension = max(stage1,stage2) + (KD+1)*N -*> = N*KD + N*max(KD+1,FACTOPTNB) -*> + max(2*KD*KD, KD*NTHREADS) -*> + (KD+1)*N +*> = N*KD + N*max(KD+1,FACTOPTNB) +*> + max(2*KD*KD, KD*NTHREADS) +*> + (KD+1)*N *> where KD is the blocking size of the reduction, *> FACTOPTNB is the blocking used by the QR or LQ *> algorithm, usually FACTOPTNB=128 is a good choice @@ -177,12 +182,12 @@ * Authors: * ======== * -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_2stage * *> \par Further Details: * ===================== @@ -202,7 +207,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -210,16 +215,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, + SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, $ HOUS2, LHOUS2, WORK, LWORK, INFO ) * IMPLICIT NONE @@ -265,10 +270,13 @@ * KD = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 ) IB = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) -* WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO, -* $ LHMIN, LWMIN + IF( N.EQ.0 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.LSAME( VECT, 'N' ) ) THEN INFO = -1 @@ -309,14 +317,14 @@ LWRK = LWORK-LDAB*N ABPOS = 1 WPOS = ABPOS + LDAB*N - CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, + CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, $ TAU, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HE2HB', -INFO ) RETURN END IF - CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, - $ WORK( ABPOS ), LDAB, D, E, + CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, + $ WORK( ABPOS ), LDAB, D, E, $ HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO ) IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRD_HB2ST', -INFO ) @@ -324,7 +332,6 @@ END IF * * - HOUS2( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F index 1d39ac942..247497ab6 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.F +++ b/lapack-netlib/SRC/zhetrd_hb2st.F @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, +* SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, * D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * #if defined(_OPENMP) @@ -53,12 +53,12 @@ *> \param[in] STAGE1 *> \verbatim *> STAGE1 is CHARACTER*1 -*> = 'N': "No": to mention that the stage 1 of the reduction +*> = 'N': "No": to mention that the stage 1 of the reduction *> from dense to band using the zhetrd_he2hb routine -*> was not called before this routine to reproduce AB. -*> In other term this routine is called as standalone. -*> = 'Y': "Yes": to mention that the stage 1 of the -*> reduction from dense to band using the zhetrd_he2hb +*> was not called before this routine to reproduce AB. +*> In other term this routine is called as standalone. +*> = 'Y': "Yes": to mention that the stage 1 of the +*> reduction from dense to band using the zhetrd_he2hb *> routine has been called to produce AB (e.g., AB is *> the output of zhetrd_he2hb. *> \endverbatim @@ -66,10 +66,10 @@ *> \param[in] VECT *> \verbatim *> VECT is CHARACTER*1 -*> = 'N': No need for the Housholder representation, +*> = 'N': No need for the Housholder representation, *> and thus LHOUS is of size max(1, 4*N); -*> = 'V': the Householder representation is needed to -*> either generate or to apply Q later on, +*> = 'V': the Householder representation is needed to +*> either generate or to apply Q later on, *> then LHOUS is to be queried and computed. *> (NOT AVAILABLE IN THIS RELEASE). *> \endverbatim @@ -132,34 +132,39 @@ *> *> \param[out] HOUS *> \verbatim -*> HOUS is COMPLEX*16 array, dimension LHOUS, that -*> store the Householder representation. +*> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) +*> Stores the Householder representation. *> \endverbatim *> *> \param[in] LHOUS *> \verbatim *> LHOUS is INTEGER -*> The dimension of the array HOUS. LHOUS = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array HOUS. +*> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a query is assumed; the routine *> only calculates the optimal size of the HOUS array, returns *> this value as the first entry of the HOUS array, and no error *> message related to LHOUS is issued by XERBLA. *> LHOUS = MAX(1, dimension) where *> dimension = 4*N if VECT='N' -*> not available now if VECT='H' +*> not available now if VECT='H' *> \endverbatim *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension LWORK. +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK = MAX(1, dimension) -*> If LWORK = -1, or LHOUS=-1, +*> The dimension of the array WORK. +*> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). +*> +*> If LWORK = -1, or LHOUS = -1, *> then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -188,7 +193,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16OTHERcomputational +*> \ingroup hetrd_hb2st * *> \par Further Details: * ===================== @@ -208,7 +213,7 @@ *> http://doi.acm.org/10.1145/2063384.2063394 *> *> A. Haidar, J. Kurzak, P. Luszczek, 2013. -*> An improved parallel singular value algorithm and its implementation +*> An improved parallel singular value algorithm and its implementation *> for multicore hardware, In Proceedings of 2013 International Conference *> for High Performance Computing, Networking, Storage and Analysis (SC '13). *> Denver, Colorado, USA, 2013. @@ -216,16 +221,16 @@ *> http://doi.acm.org/10.1145/2503210.2503292 *> *> A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra. -*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure +*> A novel hybrid CPU-GPU generalized eigensolver for electronic structure *> calculations based on fine-grained memory aware tasks. *> International Journal of High Performance Computing Applications. *> Volume 28 Issue 2, Pages 196-209, May 2014. -*> http://hpc.sagepub.com/content/28/2/196 +*> http://hpc.sagepub.com/content/28/2/196 *> *> \endverbatim *> * ===================================================================== - SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, + SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, $ D, E, HOUS, LHOUS, WORK, LWORK, INFO ) * * @@ -259,11 +264,11 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, WANTQ, UPPER, AFTERS1 - INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, + INTEGER I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, $ ED, STIND, EDIND, BLKLASTIND, COLPT, THED, $ STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID, - $ NBTILES, TTYPE, TID, NTHREADS, DEBUG, - $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, + $ NBTILES, TTYPE, TID, NTHREADS, + $ ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, $ INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU, $ SIZEV, SIZETAU, LDV, LHMIN, LWMIN DOUBLE PRECISION ABSTMP @@ -277,7 +282,7 @@ * .. * .. External Functions .. LOGICAL LSAME - INTEGER ILAENV2STAGE + INTEGER ILAENV2STAGE EXTERNAL LSAME, ILAENV2STAGE * .. * .. Executable Statements .. @@ -285,7 +290,6 @@ * Determine the minimal workspace size required. * Test the input parameters * - DEBUG = 0 INFO = 0 AFTERS1 = LSAME( STAGE1, 'Y' ) WANTQ = LSAME( VECT, 'V' ) @@ -294,9 +298,14 @@ * * Determine the block size, the workspace size and the hous size. * - IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) - LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + IB = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 ) + IF( N.EQ.0 .OR. KD.LE.1 ) THEN + LHMIN = 1 + LWMIN = 1 + ELSE + LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 ) + END IF * IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN INFO = -1 @@ -358,7 +367,7 @@ ABDPOS = KD + 1 ABOFDPOS = KD ELSE - APOS = INDA + APOS = INDA AWPOS = INDA + KD + 1 DPOS = APOS OFDPOS = DPOS + 1 @@ -366,11 +375,11 @@ ABOFDPOS = 2 ENDIF -* -* Case KD=0: -* The matrix is diagonal. We just copy it (convert to "real" for -* complex because D is double and the imaginary part should be 0) -* and store it in D. A sequential code here is better or +* +* Case KD=0: +* The matrix is diagonal. We just copy it (convert to "real" for +* complex because D is double and the imaginary part should be 0) +* and store it in D. A sequential code here is better or * in a parallel environment it might need two cores for D and E * IF( KD.EQ.0 ) THEN @@ -385,17 +394,17 @@ WORK( 1 ) = 1 RETURN END IF -* -* Case KD=1: -* The matrix is already Tridiagonal. We have to make diagonal +* +* Case KD=1: +* The matrix is already Tridiagonal. We have to make diagonal * and offdiagonal elements real, and store them in D and E. -* For that, for real precision just copy the diag and offdiag -* to D and E while for the COMPLEX case the bulge chasing is -* performed to convert the hermetian tridiagonal to symmetric -* tridiagonal. A simpler conversion formula might be used, but then +* For that, for real precision just copy the diag and offdiag +* to D and E while for the COMPLEX case the bulge chasing is +* performed to convert the hermetian tridiagonal to symmetric +* tridiagonal. A simpler conversion formula might be used, but then * updating the Q matrix will be required and based if Q is generated -* or not this might complicate the story. -* +* or not this might complicate the story. +* IF( KD.EQ.1 ) THEN DO 50 I = 1, N D( I ) = DBLE( AB( ABDPOS, I ) ) @@ -444,7 +453,7 @@ C END IF RETURN END IF * -* Main code start here. +* Main code start here. * Reduce the hermitian band of A to a tridiagonal matrix. * THGRSIZ = N @@ -453,7 +462,7 @@ C END IF NBTILES = CEILING( REAL(N)/REAL(KD) ) STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) ) THGRNB = CEILING( REAL(N-1)/REAL(THGRSIZ) ) -* +* CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA ) CALL ZLASET( "A", KD, N, ZERO, ZERO, WORK( AWPOS ), LDA ) * @@ -462,7 +471,7 @@ C END IF * #if defined(_OPENMP) !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) -!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) +!$OMP$ PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) !$OMP$ PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) !$OMP$ SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) !$OMP$ SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) @@ -471,7 +480,7 @@ C END IF #endif * * main bulge chasing loop -* +* DO 100 THGRID = 1, THGRNB STT = (THGRID-1)*THGRSIZ+1 THED = MIN( (STT + THGRSIZ -1), (N-1)) @@ -482,7 +491,7 @@ C END IF ST = STT DO 130 SWEEPID = ST, ED DO 140 K = 1, GRSIZ - MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) + MYID = (I-SWEEPID)*(STEPERCOL*GRSIZ) $ + (M-1)*GRSIZ + K IF ( MYID.EQ.1 ) THEN TTYPE = 1 @@ -508,17 +517,17 @@ C END IF ENDIF * * Call the kernel -* +* #if defined(_OPENMP) && _OPENMP >= 201307 - IF( TTYPE.NE.1 ) THEN + IF( TTYPE.NE.1 ) THEN !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(in:WORK(MYID-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK @@ -526,20 +535,20 @@ C END IF !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) !$OMP$ DEPEND(out:WORK(MYID)) TID = OMP_GET_THREAD_NUM() - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW + TID*KD ) ) !$OMP END TASK ENDIF #else - CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, + CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, $ STIND, EDIND, SWEEPID, N, KD, IB, - $ WORK ( INDA ), LDA, + $ WORK ( INDA ), LDA, $ HOUS( INDV ), HOUS( INDTAU ), LDV, $ WORK( INDW ) ) -#endif +#endif IF ( BLKLASTIND.GE.(N-1) ) THEN STT = STT + 1 EXIT @@ -554,14 +563,14 @@ C END IF !$OMP END MASTER !$OMP END PARALLEL #endif -* +* * Copy the diagonal from A to D. Note that D is REAL thus only * the Real part is needed, the imaginary part should be zero. * DO 150 I = 1, N D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) ) 150 CONTINUE -* +* * Copy the off diagonal from A to E. Note that E is REAL thus only * the Real part is needed, the imaginary part should be zero. * @@ -575,11 +584,10 @@ C END IF 170 CONTINUE ENDIF * - HOUS( 1 ) = LHMIN WORK( 1 ) = LWMIN RETURN * * End of ZHETRD_HB2ST * END - + diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f index e1b2e1794..3e3bfa374 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.f +++ b/lapack-netlib/SRC/zhetrd_he2hb.f @@ -123,8 +123,8 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (LWORK) -*> On exit, if INFO = 0, or if LWORK=-1, +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, or if LWORK = -1, *> WORK(1) returns the size of LWORK. *> \endverbatim *> @@ -132,7 +132,9 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK which should be calculated -*> by a workspace query. LWORK = MAX(1, LWORK_QUERY) +*> by a workspace query. +*> If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY). +*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error @@ -158,7 +160,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrd_he2hb * *> \par Further Details: * ===================== @@ -293,8 +295,12 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) - LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) - + IF( N.LE.KD+1 ) THEN + LWMIN = 1 + ELSE + LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 ) + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f index 78d4f71b8..433887108 100644 --- a/lapack-netlib/SRC/zhetrf.f +++ b/lapack-netlib/SRC/zhetrf.f @@ -107,7 +107,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> \endverbatim *> @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf * *> \par Further Details: * ===================== @@ -227,7 +227,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * @@ -346,6 +346,7 @@ END IF * 40 CONTINUE +* WORK( 1 ) = LWKOPT RETURN * diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f index 537c16e8c..381c87d51 100644 --- a/lapack-netlib/SRC/zhetrf_aa.f +++ b/lapack-netlib/SRC/zhetrf_aa.f @@ -101,8 +101,10 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >= MAX(1,2*N). For optimum performance -*> LWORK >= N*(1+NB), where NB is the optimal blocksize. +*> The length of WORK. +*> LWORK >= 1, if N >= 1, and LWORK >= 2*N, otherwise. +*> For optimum performance LWORK >= N*(1+NB), where NB is +*> the optimal blocksize, returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -125,10 +127,10 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_aa * * ===================================================================== - SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO) + SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -152,7 +154,7 @@ * * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER J, LWKOPT + INTEGER J, LWKMIN, LWKOPT INTEGER NB, MJ, NJ, K1, K2, J1, J2, J3, JB COMPLEX*16 ALPHA * .. @@ -178,18 +180,25 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( N.LE.1 ) THEN + LWKMIN = 1 + LWKOPT = 1 + ELSE + LWKMIN = 2*N + LWKOPT = (NB+1)*N + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -7 END IF * IF( INFO.EQ.0 ) THEN - LWKOPT = (NB+1)*N WORK( 1 ) = LWKOPT END IF * @@ -202,11 +211,11 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF IPIV( 1 ) = 1 - IF ( N.EQ.1 ) THEN + IF( N.EQ.1 ) THEN A( 1, 1 ) = DBLE( A( 1, 1 ) ) RETURN END IF diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f index 477602b5e..bab13a99d 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.f +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f @@ -87,14 +87,14 @@ *> *> \param[out] TB *> \verbatim -*> TB is COMPLEX*16 array, dimension (LTB) +*> TB is COMPLEX*16 array, dimension (MAX(1,LTB)) *> On exit, details of the LU factorization of the band matrix. *> \endverbatim *> *> \param[in] LTB *> \verbatim *> LTB is INTEGER -*> The size of the array TB. LTB >= 4*N, internally +*> The size of the array TB. LTB >= MAX(1,4*N), internally *> used to select NB such that LTB >= (3*NB+1)*N. *> *> If LTB = -1, then a workspace query is assumed; the @@ -121,14 +121,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 workspace of size LWORK +*> WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)) *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The size of WORK. LWORK >= N, internally used to select NB -*> such that LWORK >= N*NB. +*> The size of WORK. LWORK >= MAX(1,N), internally used to +*> select NB such that LWORK >= N*NB. *> *> If LWORK = -1, then a workspace query is assumed; the *> routine only calculates the optimal size of the WORK array, @@ -152,7 +152,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16SYcomputational +*> \ingroup hetrf_aa_2stage * * ===================================================================== SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, @@ -182,7 +182,7 @@ * .. Local Scalars .. LOGICAL UPPER, TQUERY, WQUERY INTEGER I, J, K, I1, I2, TD - INTEGER LDTB, NB, KB, JB, NT, IINFO + INTEGER LWKOPT, LDTB, NB, KB, JB, NT, IINFO COMPLEX*16 PIV * .. * .. External Functions .. @@ -212,9 +212,9 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN + ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN INFO = -6 - ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN INFO = -10 END IF * @@ -228,10 +228,10 @@ NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 ) IF( INFO.EQ.0 ) THEN IF( TQUERY ) THEN - TB( 1 ) = (3*NB+1)*N + TB( 1 ) = MAX( 1, (3*NB+1)*N ) END IF IF( WQUERY ) THEN - WORK( 1 ) = N*NB + WORK( 1 ) = MAX( 1, N*NB ) END IF END IF IF( TQUERY .OR. WQUERY ) THEN @@ -240,7 +240,7 @@ * * Quick return * - IF ( N.EQ.0 ) THEN + IF( N.EQ.0 ) THEN RETURN ENDIF * @@ -392,7 +392,7 @@ CALL ZGETRF( N-(J+1)*NB, NB, $ WORK, N, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * @@ -587,7 +587,7 @@ c END IF CALL ZGETRF( N-(J+1)*NB, NB, $ A( (J+1)*NB+1, J*NB+1 ), LDA, $ IPIV( (J+1)*NB+1 ), IINFO ) -c IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN +c IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN c INFO = IINFO+(J+1)*NB c END IF * diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f index 73dd9f9d0..7c505fa4d 100644 --- a/lapack-netlib/SRC/zhetrf_rk.f +++ b/lapack-netlib/SRC/zhetrf_rk.f @@ -177,14 +177,14 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ). +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned *> by ILAENV. *> @@ -229,7 +229,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rk * *> \par Further Details: * ===================== @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f index e9de47248..a56349092 100644 --- a/lapack-netlib/SRC/zhetrf_rook.f +++ b/lapack-netlib/SRC/zhetrf_rook.f @@ -122,7 +122,7 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The length of WORK. LWORK >=1. For best performance +*> The length of WORK. LWORK >= 1. For best performance *> LWORK >= N*NB, where NB is the block size returned by ILAENV. *> *> If LWORK = -1, then a workspace query is assumed; the routine @@ -150,7 +150,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrf_rook * *> \par Further Details: * ===================== diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f index 384745c3a..1d932b866 100644 --- a/lapack-netlib/SRC/zhetri2.f +++ b/lapack-netlib/SRC/zhetri2.f @@ -88,16 +88,16 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3) +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> WORK is size >= (N+NB+1)*(NB+3) +*> If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3). *> If LWORK = -1, then a workspace query is assumed; the routine -*> calculates: +*> calculates: *> - the optimal size of the WORK array, returns *> this value as the first entry of the WORK array, *> - and no error message related to LWORK is issued by XERBLA. @@ -120,7 +120,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetri2 * * ===================================================================== SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO ) @@ -159,9 +159,13 @@ INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) +* * Get blocksize +* NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 ) - IF ( NBMAX .GE. N ) THEN + IF( N.EQ.0 ) THEN + MINSIZE = 1 + ELSE IF( NBMAX.GE.N ) THEN MINSIZE = N ELSE MINSIZE = (N+NBMAX+1)*(NBMAX+3) @@ -173,28 +177,29 @@ INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN INFO = -7 END IF -* -* Quick return if possible -* * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRI2', -INFO ) RETURN ELSE IF( LQUERY ) THEN - WORK(1)=MINSIZE + WORK( 1 ) = MINSIZE RETURN END IF +* +* Quick return if possible +* IF( N.EQ.0 ) $ RETURN - IF( NBMAX .GE. N ) THEN + IF( NBMAX.GE.N ) THEN CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO ) ELSE CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO ) END IF +* RETURN * * End of ZHETRI2 diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f index 06ac1fd28..b7a1f7f07 100644 --- a/lapack-netlib/SRC/zhetrs_aa.f +++ b/lapack-netlib/SRC/zhetrs_aa.f @@ -106,7 +106,13 @@ *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= max(1,3*N-2). +*> The dimension of the array WORK. +*> If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the minimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. *> \endverbatim *> *> \param[out] INFO @@ -124,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complex16HEcomputational +*> \ingroup hetrs_aa * * ===================================================================== SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, @@ -152,7 +158,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, UPPER - INTEGER K, KP, LWKOPT + INTEGER K, KP, LWKMIN * .. * .. External Functions .. LOGICAL LSAME @@ -162,13 +168,19 @@ EXTERNAL ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX + INTRINSIC MIN, MAX * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) LQUERY = ( LWORK.EQ.-1 ) + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWKMIN = 1 + ELSE + LWKMIN = 3*N-2 + END IF +* IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( N.LT.0 ) THEN @@ -179,21 +191,20 @@ INFO = -5 ELSE IF( LDB.LT.MAX( 1, N ) ) THEN INFO = -8 - ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN + ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN INFO = -10 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZHETRS_AA', -INFO ) RETURN ELSE IF( LQUERY ) THEN - LWKOPT = (3*N-2) - WORK( 1 ) = LWKOPT + WORK( 1 ) = LWKMIN RETURN END IF * * Quick return if possible * - IF( N.EQ.0 .OR. NRHS.EQ.0 ) + IF( MIN( N, NRHS ).EQ.0 ) $ RETURN * IF( UPPER ) THEN diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f index 4abefa434..59a0a5558 100644 --- a/lapack-netlib/SRC/zlamswlq.f +++ b/lapack-netlib/SRC/zlamswlq.f @@ -127,17 +127,20 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. -*> If SIDE = 'L', LWORK >= max(1,NB) * MB; -*> if SIDE = 'R', LWORK >= max(1,M) * MB. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,NB*MB). +*> If SIDE = 'R', LWORK >= max(1,M*MB). +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -189,92 +192,103 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamswlq +*> * ===================================================================== SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA + EXTERNAL ZTPMLQT, ZGEMLQT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * MB ELSE LW = M * MB END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( K.LT.0 ) THEN INFO = -5 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN INFO = -4 - ELSE IF( K.LT.MB .OR. MB.LT.1) THEN + ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN INFO = -6 ELSE IF( LDA.LT.MAX( 1, K ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, MB) ) THEN + ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN + END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMSWLQ', -INFO ) - WORK(1) = LW RETURN - ELSE IF (LQUERY) THEN - WORK(1) = LW + ELSE IF( LQUERY ) THEN RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF * IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN END IF * @@ -403,7 +417,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMSWLQ diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f index 5030cb75f..03770c06e 100644 --- a/lapack-netlib/SRC/zlamtsqr.f +++ b/lapack-netlib/SRC/zlamtsqr.f @@ -128,22 +128,24 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim +*> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. +*> If MIN(M,N,K) = 0, LWORK >= 1. +*> If SIDE = 'L', LWORK >= max(1,N*NB). +*> If SIDE = 'R', LWORK >= max(1,MB*NB). *> -*> If SIDE = 'L', LWORK >= max(1,N)*NB; -*> if SIDE = 'R', LWORK >= max(1,MB)*NB. *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -191,46 +193,50 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup lamtsqr +*> * ===================================================================== SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T, - $ LDT, C, LDC, WORK, LWORK, INFO ) + $ LDT, C, LDC, WORK, LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. - CHARACTER SIDE, TRANS - INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC + CHARACTER SIDE, TRANS + INTEGER INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), C(LDC, * ), - $ T( LDT, * ) + COMPLEX*16 A( LDA, * ), WORK( * ), C( LDC, * ), + $ T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY - INTEGER I, II, KK, LW, CTR, Q + LOGICAL LEFT, RIGHT, TRAN, NOTRAN, LQUERY + INTEGER I, II, KK, LW, CTR, Q, MINMNK, LWMIN * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. External Subroutines .. - EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA + EXTERNAL ZGEMQRT, ZTPMQRT, XERBLA * .. * .. Executable Statements .. * * Test the input arguments * - LQUERY = LWORK.LT.0 + INFO = 0 + LQUERY = ( LWORK.EQ.-1 ) NOTRAN = LSAME( TRANS, 'N' ) TRAN = LSAME( TRANS, 'C' ) LEFT = LSAME( SIDE, 'L' ) RIGHT = LSAME( SIDE, 'R' ) - IF (LEFT) THEN + IF( LEFT ) THEN LW = N * NB Q = M ELSE @@ -238,11 +244,17 @@ Q = N END IF * - INFO = 0 + MINMNK = MIN( M, N, K ) + IF( MINMNK.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = MAX( 1, LW ) + END IF +* IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN - INFO = -1 + INFO = -1 ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN - INFO = -2 + INFO = -2 ELSE IF( M.LT.K ) THEN INFO = -3 ELSE IF( N.LT.0 ) THEN @@ -253,38 +265,38 @@ INFO = -7 ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN INFO = -9 - ELSE IF( LDT.LT.MAX( 1, NB) ) THEN + ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN INFO = -11 ELSE IF( LDC.LT.MAX( 1, M ) ) THEN - INFO = -13 - ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN + INFO = -13 + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -15 END IF * -* Determine the block size if it is tall skinny or short and wide -* - IF( INFO.EQ.0) THEN - WORK(1) = LW + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLAMTSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N,K).EQ.0 ) THEN + IF( MINMNK.EQ.0 ) THEN RETURN END IF +* +* Determine the block size if it is tall skinny or short and wide * IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA, - $ T, LDT, C, LDC, WORK, INFO) + $ T, LDT, C, LDC, WORK, INFO ) RETURN - END IF + END IF * IF(LEFT.AND.NOTRAN) THEN * @@ -410,7 +422,7 @@ * END IF * - WORK(1) = LW + WORK( 1 ) = LWMIN RETURN * * End of ZLAMTSQR diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f index be4c48539..735207132 100644 --- a/lapack-netlib/SRC/zlaswlq.f +++ b/lapack-netlib/SRC/zlaswlq.f @@ -96,22 +96,23 @@ *> The leading dimension of the array T. LDT >= MB. *> \endverbatim *> -*> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) -*> +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= MB*M. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. -*> *> \endverbatim +*> *> \param[out] INFO *> \verbatim *> INFO is INTEGER @@ -159,33 +160,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup laswlq +*> * ===================================================================== SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK, - $ INFO) + $ INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT + INTEGER INFO, LDA, M, N, MB, NB, LWORK, LDT * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, MINMN, LWMIN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. EXTERNAL ZGELQT, ZTPLQT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -196,12 +201,19 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = M*MB + END IF * IF( M.LT.0 ) THEN INFO = -1 ELSE IF( N.LT.0 .OR. N.LT.M ) THEN INFO = -2 - ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN + ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN INFO = -3 ELSE IF( NB.LE.0 ) THEN INFO = -4 @@ -209,60 +221,61 @@ INFO = -6 ELSE IF( LDT.LT.MB ) THEN INFO = -8 - ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = MB*M +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF * IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLASWLQ', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The LQ Decomposition * - IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN - CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO) + IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN + CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO ) RETURN - END IF + END IF * - KK = MOD((N-M),(NB-M)) - II=N-KK+1 + KK = MOD((N-M),(NB-M)) + II = N-KK+1 * -* Compute the LQ factorization of the first block A(1:M,1:NB) +* Compute the LQ factorization of the first block A(1:M,1:NB) * - CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO) - CTR = 1 + CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = NB+1, II-NB+M , (NB-M) + DO I = NB+1, II-NB+M, (NB-M) * -* Compute the QR factorization of the current block A(1:M,I:I+NB-M) +* Compute the QR factorization of the current block A(1:M,I:I+NB-M) * - CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), - $ LDA, T(1, CTR * M + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ), + $ LDA, T(1, CTR * M + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * * Compute the QR factorization of the last block A(1:M,II:N) * - IF (II.LE.N) THEN + IF( II.LE.N ) THEN CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ), - $ LDA, T(1, CTR * M + 1), LDT, - $ WORK, INFO ) - END IF + $ LDA, T(1, CTR * M + 1), LDT, + $ WORK, INFO ) + END IF * - WORK( 1 ) = M * MB + WORK( 1 ) = LWMIN RETURN * * End of ZLASWLQ diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f index 231a17274..27eac839b 100644 --- a/lapack-netlib/SRC/zlatrs3.f +++ b/lapack-netlib/SRC/zlatrs3.f @@ -158,7 +158,11 @@ *> \endverbatim *> *> \param[in] LWORK +*> \verbatim *> LWORK is INTEGER +*> The dimension of the array WORK. +*> +*> If MIN(N,NRHS) = 0, LWORK >= 1, else *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where *> NBA = (N + NB - 1)/NB and NB is the optimal block size. *> @@ -166,6 +170,7 @@ *> only calculates the optimal dimensions of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. +*> \endverbatim *> *> \param[out] INFO *> \verbatim @@ -182,7 +187,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup latrs3 *> \par Further Details: * ===================== * \verbatim @@ -257,7 +262,7 @@ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, - $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, $ SCAMIN, SMLNUM, TMAX * .. @@ -296,15 +301,24 @@ * row. WORK( I + KK * LDS ) is the scale factor of the vector * segment associated with the I-th block row and the KK-th vector * in the block column. +* LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) LDS = NBA +* * The second part stores upper bounds of the triangular A. There are * a total of NBA x NBA blocks, of which only the upper triangular * part or the lower triangular part is referenced. The upper bound of * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). +* LANRM = NBA * NBA AWRK = LSCALE - WORK( 1 ) = LSCALE + LANRM +* + IF( MIN( N, NRHS ).EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = LSCALE + LANRM + END IF + WORK( 1 ) = LWMIN * * Test the input parameters. * @@ -326,7 +340,7 @@ INFO = -8 ELSE IF( LDX.LT.MAX( 1, N ) ) THEN INFO = -10 - ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN INFO = -14 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f index 8c938aebc..24d00f28a 100644 --- a/lapack-netlib/SRC/zlatsqr.f +++ b/lapack-netlib/SRC/zlatsqr.f @@ -101,15 +101,18 @@ *> *> \param[out] WORK *> \verbatim -*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the minimal LWORK. *> \endverbatim *> *> \param[in] LWORK *> \verbatim *> LWORK is INTEGER -*> The dimension of the array WORK. LWORK >= NB*N. +*> The dimension of the array WORK. +*> LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise. +*> *> If LWORK = -1, then a workspace query is assumed; the routine -*> only calculates the optimal size of the WORK array, returns +*> only calculates the minimal size of the WORK array, returns *> this value as the first entry of the WORK array, and no error *> message related to LWORK is issued by XERBLA. *> \endverbatim @@ -161,33 +164,37 @@ *> SIAM J. Sci. Comput, vol. 34, no. 1, 2012 *> \endverbatim *> +*> \ingroup latsqr +*> * ===================================================================== SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, - $ LWORK, INFO) + $ LWORK, INFO ) * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- * * .. Scalar Arguments .. - INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK + INTEGER INFO, LDA, M, N, MB, NB, LDT, LWORK * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, * ), WORK( * ), T(LDT, *) + COMPLEX*16 A( LDA, * ), WORK( * ), T( LDT, * ) * .. * * ===================================================================== * * .. * .. Local Scalars .. - LOGICAL LQUERY - INTEGER I, II, KK, CTR + LOGICAL LQUERY + INTEGER I, II, KK, CTR, LWMIN, MINMN * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME +* .. * .. EXTERNAL SUBROUTINES .. - EXTERNAL ZGEQRT, ZTPQRT, XERBLA + EXTERNAL ZGEQRT, ZTPQRT, XERBLA +* .. * .. INTRINSIC FUNCTIONS .. INTRINSIC MAX, MIN, MOD * .. @@ -198,6 +205,13 @@ INFO = 0 * LQUERY = ( LWORK.EQ.-1 ) +* + MINMN = MIN( M, N ) + IF( MINMN.EQ.0 ) THEN + LWMIN = 1 + ELSE + LWMIN = N*NB + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -205,64 +219,65 @@ INFO = -2 ELSE IF( MB.LT.1 ) THEN INFO = -3 - ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN + ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN INFO = -4 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -6 ELSE IF( LDT.LT.NB ) THEN INFO = -8 - ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN + ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN INFO = -10 END IF - IF( INFO.EQ.0) THEN - WORK(1) = NB*N +* + IF( INFO.EQ.0 ) THEN + WORK( 1 ) = LWMIN END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZLATSQR', -INFO ) RETURN - ELSE IF (LQUERY) THEN - RETURN + ELSE IF( LQUERY ) THEN + RETURN END IF * * Quick return if possible * - IF( MIN(M,N).EQ.0 ) THEN - RETURN + IF( MINMN.EQ.0 ) THEN + RETURN END IF * * The QR Decomposition * - IF ((MB.LE.N).OR.(MB.GE.M)) THEN - CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO) - RETURN - END IF - KK = MOD((M-N),(MB-N)) - II=M-KK+1 + IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN + CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO ) + RETURN + END IF + KK = MOD((M-N),(MB-N)) + II = M-KK+1 * -* Compute the QR factorization of the first block A(1:MB,1:N) +* Compute the QR factorization of the first block A(1:MB,1:N) * - CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) - CTR = 1 + CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO ) + CTR = 1 * - DO I = MB+1, II-MB+N , (MB-N) + DO I = MB+1, II-MB+N, (MB-N) * -* Compute the QR factorization of the current block A(I:I+MB-N,1:N) +* Compute the QR factorization of the current block A(I:I+MB-N,1:N) * - CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, - $ T(1, CTR * N + 1), - $ LDT, WORK, INFO ) - CTR = CTR + 1 - END DO + CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA, + $ T(1, CTR * N + 1), + $ LDT, WORK, INFO ) + CTR = CTR + 1 + END DO * -* Compute the QR factorization of the last block A(II:M,1:N) +* Compute the QR factorization of the last block A(II:M,1:N) * - IF (II.LE.M) THEN - CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, - $ T(1,CTR * N + 1), LDT, - $ WORK, INFO ) - END IF + IF( II.LE.M ) THEN + CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA, + $ T(1,CTR * N + 1), LDT, + $ WORK, INFO ) + END IF * - work( 1 ) = N*NB + WORK( 1 ) = LWMIN RETURN * * End of ZLATSQR From 201f22f49ae87f83b8970a257bc580e33cb94f30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:27:38 +0100 Subject: [PATCH 505/718] Fix issues related to ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/SRC/cgedmd.f90 | 872 +++++++++++++++++++-------------- lapack-netlib/SRC/dgedmd.f90 | 920 ++++++++++++++++++++--------------- lapack-netlib/SRC/sgedmd.f90 | 916 +++++++++++++++++++--------------- lapack-netlib/SRC/zgedmd.f90 | 848 +++++++++++++++++++------------- 4 files changed, 2084 insertions(+), 1472 deletions(-) diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90 index 499489270..1413130ec 100644 --- a/lapack-netlib/SRC/cgedmd.f90 +++ b/lapack-netlib/SRC/cgedmd.f90 @@ -1,22 +1,526 @@ +!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!..... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 +! +!..... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> CGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, CGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, CGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... +! Arguments +! ========= +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim +!..... +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim +!..... +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim +!..... +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim +!..... +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: CGESVD (the QR SVD algorithm) +!> 2 :: CGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim +!..... +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim +!..... +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim +!..... +!> \param[in,out] X +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim +!..... +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim +!..... +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim +!..... +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim +!..... +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim +!..... +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim +!..... +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim +!..... +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim +!..... +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim +!..... +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim +!..... +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim +!..... +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim +!..... +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim +!..... +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim +!..... +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim +!..... +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by CGEEV. +!> See the description of K. +!> \endverbatim +!..... +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim +!..... +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to CGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim +!..... +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), +!> where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: CGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: CGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim +!..... +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to CGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim +!..... +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where +!> LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: CGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: CGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: CGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: CGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim +!..... +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim +!..... +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim +!..... +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! +!............................................................. +!............................................................. SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & M, N, X, LDX, Y, LDY, NRNK, TOL, & K, EIGS, Z, LDZ, RES, B, LDB, & W, LDW, S, LDS, ZWORK, LZWORK, & RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! !..... USE iso_fortran_env IMPLICIT NONE INTEGER, PARAMETER :: WP = real32 -!..... +! ! Scalar arguments +! ~~~~~~~~~~~~~~~~ CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & NRNK, LDZ, LDB, LDW, LDS, & LIWORK, LRWORK, LZWORK INTEGER, INTENT(OUT) :: K, INFO REAL(KIND=WP), INTENT(IN) :: TOL +! ! Array arguments +! ~~~~~~~~~~~~~~~ COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & W(LDW,*), S(LDS,*) @@ -25,364 +529,14 @@ REAL(KIND=WP), INTENT(OUT) :: RES(*) REAL(KIND=WP), INTENT(OUT) :: RWORK(*) INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! CGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, CGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, CGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ -!...................................................................... -! Arguments -! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. -!..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. -!..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. -!..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. -!..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: CGESVD (the QR SVD algorithm) -! 2 :: CGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: CGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. -!..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). -!..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). -!..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. -!..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. -!..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. -!..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. -!..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. -!..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. -!..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. -!..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. -!..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. -!..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. -!..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. -!..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. -!..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. -!..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. -!..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. -!..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by CGEEV. -! See the description of K. -!..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. -!..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to CGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. -!..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), -! where LZWORK_CGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: CGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: CGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: CGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. -!..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to CGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. -!..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where -! LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: CGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: CGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: CGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: CGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). -!..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. -!..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. -!..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. -!............................................................. -!............................................................. ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -400,7 +554,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2 @@ -408,13 +562,13 @@ INTEGER ICAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL CAXPY, CGEMM, CSSCAL EXTERNAL CGEEV, CGEJSV, CGESDD, CGESVD, CGESVDQ, & CLACPY, CLASCL, CLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC FLOAT, INT, MAX, SQRT @@ -607,7 +761,8 @@ K = 0 DO i = 1, N !WORK(i) = SCNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -680,7 +835,8 @@ ! carefully computed using CLASSQ. DO i = 1, N !RWORK(i) = SCNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90 index 20424808f..15df48fe9 100644 --- a/lapack-netlib/SRC/dgedmd.f90 +++ b/lapack-netlib/SRC/dgedmd.f90 @@ -1,424 +1,574 @@ - SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) +! !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! DGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, DGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, DGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! -!...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ !............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> DGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, DGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, DGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim +!...................................................................... +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!...................................................................... ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) is CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: DGESVD (the QR SVD algorithm) -! 2 :: DGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: DGESVD (the QR SVD algorithm) +!> 2 :: DGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: DGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1). -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1). +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by DGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by DGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to DGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! leng of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to DGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> leng of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of DGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of DGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of DGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of DGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of DGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of DGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of DGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of DGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -432,10 +582,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) DLANGE, DLAMCH, DNRM2 @@ -443,13 +594,13 @@ INTEGER IDAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL DAXPY, DGEMM, DSCAL EXTERNAL DGEEV, DGEJSV, DGESDD, DGESVD, DGESVDQ, & DLACPY, DLASCL, DLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -632,7 +783,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -705,7 +857,8 @@ ! carefully computed using DLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -1051,4 +1204,3 @@ RETURN ! ...... END SUBROUTINE DGEDMD - diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90 index 49cb11527..4860e8898 100644 --- a/lapack-netlib/SRC/sgedmd.f90 +++ b/lapack-netlib/SRC/sgedmd.f90 @@ -1,423 +1,573 @@ - SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, REIG, IMEIG, Z, LDZ, RES, & - B, LDB, W, LDW, S, LDS, & - WORK, LWORK, IWORK, LIWORK, INFO ) -! March 2023 +!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== +! +! SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, REIG, IMEIG, Z, LDZ, RES, & +! B, LDB, W, LDW, S, LDS, & +! WORK, LWORK, IWORK, LIWORK, INFO ) !..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real32 +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real32 !..... ! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LWORK, LIWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LWORK, LIWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL ! Array arguments - REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & - RES(*) - REAL(KIND=WP), INTENT(OUT) :: WORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! SGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, SGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, SGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +! REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & +! RES(*) +! REAL(KIND=WP), INTENT(OUT) :: WORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) ! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> SGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, SGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, SGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim !...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Distribution Statement A: +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim +!============================================================ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: SGESVD (the QR SVD algorithm) -! 2 :: SGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: SGESVD (the QR SVD algorithm) +!> 2 :: SGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: SGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) REAL(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in,out] X +!> \verbatim +!> X (input/output) REAL(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. +!> \endverbatim !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> \param[in] LDX +!> \verbatim +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) REAL(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) REAL(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! REIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of REIG contain -! the real parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! See the descriptions of K, IMEIG, and Z. +!> \param[out] REIG +!> \verbatim +!> REIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of REIG contain +!> the real parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> See the descriptions of K, IMEIG, and Z. +!> \endverbatim !..... -! IMEIG (output) REAL(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of IMEIG contain -! the imaginary parts of the computed eigenvalues -! REIG(1:K) + sqrt(-1)*IMEIG(1:K). -! The eigenvalues are determined as follows: -! If IMEIG(i) == 0, then the corresponding eigenvalue is -! real, LAMBDA(i) = REIG(i). -! If IMEIG(i)>0, then the corresponding complex -! conjugate pair of eigenvalues reads -! LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) -! LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) -! That is, complex conjugate pairs have consecutive -! indices (i,i+1), with the positive imaginary part -! listed first. -! See the descriptions of K, REIG, and Z. +!> \param[out] IMEIG +!> \verbatim +!> IMEIG (output) REAL(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of IMEIG contain +!> the imaginary parts of the computed eigenvalues +!> REIG(1:K) + sqrt(-1)*IMEIG(1:K). +!> The eigenvalues are determined as follows: +!> If IMEIG(i) == 0, then the corresponding eigenvalue is +!> real, LAMBDA(i) = REIG(i). +!> If IMEIG(i)>0, then the corresponding complex +!> conjugate pair of eigenvalues reads +!> LAMBDA(i) = REIG(i) + sqrt(-1)*IMAG(i) +!> LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) +!> That is, complex conjugate pairs have consecutive +!> indices (i,i+1), with the positive imaginary part +!> listed first. +!> See the descriptions of K, REIG, and Z. +!> \endverbatim !..... -! Z (workspace/output) REAL(KIND=WP) M-by-N array -! If JOBZ =='V' then -! Z contains real Ritz vectors as follows: -! If IMEIG(i)=0, then Z(:,i) is an eigenvector of -! the i-th Ritz value; ||Z(:,i)||_2=1. -! If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then -! [Z(:,i) Z(:,i+1)] span an invariant subspace and -! the Ritz values extracted from this subspace are -! REIG(i) + sqrt(-1)*IMEIG(i) and -! REIG(i) - sqrt(-1)*IMEIG(i). -! The corresponding eigenvectors are -! Z(:,i) + sqrt(-1)*Z(:,i+1) and -! Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. -! || Z(:,i:i+1)||_F = 1. -! If JOBZ == 'F', then the above descriptions hold for -! the columns of X(:,1:K)*W(1:K,1:K), where the columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. The columns of W(1:K,1:K) -! are similarly structured: If IMEIG(i) == 0 then -! X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 -! then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and -! X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) -! are the eigenvectors of LAMBDA(i), LAMBDA(i+1). -! See the descriptions of REIG, IMEIG, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) REAL(KIND=WP) M-by-N array +!> If JOBZ =='V' then +!> Z contains real Ritz vectors as follows: +!> If IMEIG(i)=0, then Z(:,i) is an eigenvector of +!> the i-th Ritz value; ||Z(:,i)||_2=1. +!> If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then +!> [Z(:,i) Z(:,i+1)] span an invariant subspace and +!> the Ritz values extracted from this subspace are +!> REIG(i) + sqrt(-1)*IMEIG(i) and +!> REIG(i) - sqrt(-1)*IMEIG(i). +!> The corresponding eigenvectors are +!> Z(:,i) + sqrt(-1)*Z(:,i+1) and +!> Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. +!> || Z(:,i:i+1)||_F = 1. +!> If JOBZ == 'F', then the above descriptions hold for +!> the columns of X(:,1:K)*W(1:K,1:K), where the columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. The columns of W(1:K,1:K) +!> are similarly structured: If IMEIG(i) == 0 then +!> X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 +!> then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and +!> X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) +!> are the eigenvectors of LAMBDA(i), LAMBDA(i+1). +!> See the descriptions of REIG, IMEIG, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs. -! If LAMBDA(i) is real, then -! RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. -! If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair -! then -! RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F -! where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] -! [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. -! It holds that -! RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 -! RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 -! where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) -! ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) -! See the description of REIG, IMEIG and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs. +!> If LAMBDA(i) is real, then +!> RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. +!> If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair +!> then +!> RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F +!> where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] +!> [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. +!> It holds that +!> RES(i) = || A*ZC(:,i) - LAMBDA(i) *ZC(:,i) ||_2 +!> RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 +!> where ZC(:,i) = Z(:,i) + sqrt(-1)*Z(:,i+1) +!> ZC(:,i+1) = Z(:,i) - sqrt(-1)*Z(:,i+1) +!> See the description of REIG, IMEIG and Z. +!> \endverbatim !..... -! B (output) REAL(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1;K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) REAL(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1;K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) REAL(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient (real and -! imaginary parts for each complex conjugate pair of the -! eigenvalues). The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! left singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) REAL(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient (real and +!> imaginary parts for each complex conjugate pair of the +!> eigenvalues). The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> left singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) REAL(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by SGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) REAL(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by SGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array -! On exit, WORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain -! scaling factor WORK(N+2)/WORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to SGEDMD is only workspace query, then -! WORK(1) contains the minimal workspace length and -! WORK(2) is the optimal workspace length. Hence, the -! length of work is at least 2. -! See the description of LWORK. +!> \param[out] WORK +!> \verbatim +!> WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array +!> On exit, WORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain +!> scaling factor WORK(N+2)/WORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to SGEDMD is only workspace query, then +!> WORK(1) contains the minimal workspace length and +!> WORK(2) is the optimal workspace length. Hence, the +!> length of work is at least 2. +!> See the description of LWORK. +!> \endverbatim !..... -! LWORK (input) INTEGER -! The minimal length of the workspace vector WORK. -! LWORK is calculated as follows: -! If WHTSVD == 1 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). -! If JOBZ == 'N' then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). -! Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal -! workspace length of SGESVD. -! If WHTSVD == 2 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the -! minimal workspace length of SGESDD. -! If WHTSVD == 3 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = N+M+MAX(3*N+1, -! MAX(1,3*N+M,5*N),MAX(1,N)) -! is the minimal workspace length of SGESVDQ. -! If WHTSVD == 4 :: -! If JOBZ == 'V', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) -! If JOBZ == 'N', then -! LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) -! Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the -! minimal workspace length of SGEJSV. -! The above expressions are not simplified in order to -! make the usage of WORK more transparent, and for -! easier checking. In any case, LWORK >= 2. -! If on entry LWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LWORK +!> \verbatim +!> LWORK (input) INTEGER +!> The minimal length of the workspace vector WORK. +!> LWORK is calculated as follows: +!> If WHTSVD == 1 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). +!> If JOBZ == 'N' then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). +!> Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal +!> workspace length of SGESVD. +!> If WHTSVD == 2 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the +!> minimal workspace length of SGESDD. +!> If WHTSVD == 3 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = N+M+MAX(3*N+1, +!> MAX(1,3*N+M,5*N),MAX(1,N)) +!> is the minimal workspace length of SGESVDQ. +!> If WHTSVD == 4 :: +!> If JOBZ == 'V', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) +!> If JOBZ == 'N', then +!> LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) +!> Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the +!> minimal workspace length of SGEJSV. +!> The above expressions are not simplified in order to +!> make the usage of WORK more transparent, and for +!> easier checking. In any case, LWORK >= 2. +!> If on entry LWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for both WORK and -! IWORK. See the descriptions of WORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for both WORK and +!> IWORK. See the descriptions of WORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, REIG, IMEIG, Z, LDZ, RES, & + B, LDB, W, LDW, S, LDS, & + WORK, LWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real32 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LWORK, LIWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + REAL(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + REAL(KIND=WP), INTENT(OUT) :: REIG(*), IMEIG(*), & + RES(*) + REAL(KIND=WP), INTENT(OUT) :: WORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -431,11 +581,11 @@ WNTEX, WNTREF, WNTRES, WNTVEC CHARACTER :: JOBZL, T_OR_N CHARACTER :: JSVOPT - +! ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) SLANGE, SLAMCH, SNRM2 @@ -443,13 +593,13 @@ INTEGER ISAMAX LOGICAL SISNAN, LSAME EXTERNAL SISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL SAXPY, SGEMM, SSCAL EXTERNAL SGEEV, SGEJSV, SGESDD, SGESVD, SGESVDQ, & SLACPY, SLASCL, SLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC INT, FLOAT, MAX, SQRT @@ -632,7 +782,8 @@ K = 0 DO i = 1, N !WORK(i) = DNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 @@ -705,7 +856,8 @@ ! carefully computed using SLASSQ. DO i = 1, N !WORK(i) = DNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN K = 0 diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90 index 090641ad8..5045cb166 100644 --- a/lapack-netlib/SRC/zgedmd.f90 +++ b/lapack-netlib/SRC/zgedmd.f90 @@ -1,389 +1,539 @@ - SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & - M, N, X, LDX, Y, LDY, NRNK, TOL, & - K, EIGS, Z, LDZ, RES, B, LDB, & - W, LDW, S, LDS, ZWORK, LZWORK, & - RWORK, LRWORK, IWORK, LIWORK, INFO ) -! March 2023 -!..... - USE iso_fortran_env - IMPLICIT NONE - INTEGER, PARAMETER :: WP = real64 - -!..... -! Scalar arguments - CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF - INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & - NRNK, LDZ, LDB, LDW, LDS, & - LIWORK, LRWORK, LZWORK - INTEGER, INTENT(OUT) :: K, INFO - REAL(KIND=WP), INTENT(IN) :: TOL -! Array arguments - COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) - COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & - W(LDW,*), S(LDS,*) - COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) - COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) - REAL(KIND=WP), INTENT(OUT) :: RES(*) - REAL(KIND=WP), INTENT(OUT) :: RWORK(*) - INTEGER, INTENT(OUT) :: IWORK(*) -!............................................................ -! Purpose -! ======= -! ZGEDMD computes the Dynamic Mode Decomposition (DMD) for -! a pair of data snapshot matrices. For the input matrices -! X and Y such that Y = A*X with an unaccessible matrix -! A, ZGEDMD computes a certain number of Ritz pairs of A using -! the standard Rayleigh-Ritz extraction from a subspace of -! range(X) that is determined using the leading left singular -! vectors of X. Optionally, ZGEDMD returns the residuals -! of the computed Ritz pairs, the information needed for -! a refinement of the Ritz vectors, or the eigenvectors of -! the Exact DMD. -! For further details see the references listed -! below. For more details of the implementation see [3]. -! -! References -! ========== -! [1] P. Schmid: Dynamic mode decomposition of numerical -! and experimental data, -! Journal of Fluid Mechanics 656, 5-28, 2010. -! [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal -! decompositions: analysis and enhancements, -! SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. -! [3] Z. Drmac: A LAPACK implementation of the Dynamic -! Mode Decomposition I. Technical report. AIMDyn Inc. -! and LAPACK Working Note 298. -! [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. -! Brunton, N. Kutz: On Dynamic Mode Decomposition: -! Theory and Applications, Journal of Computational -! Dynamics 1(2), 391 -421, 2014. +!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices. +! +! =========== DOCUMENTATION =========== +! +! Definition: +! =========== ! +! SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & +! M, N, X, LDX, Y, LDY, NRNK, TOL, & +! K, EIGS, Z, LDZ, RES, B, LDB, & +! W, LDW, S, LDS, ZWORK, LZWORK, & +! RWORK, LRWORK, IWORK, LIWORK, INFO ) +!...... +! USE iso_fortran_env +! IMPLICIT NONE +! INTEGER, PARAMETER :: WP = real64 +! +!...... +! Scalar arguments +! CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF +! INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & +! NRNK, LDZ, LDB, LDW, LDS, & +! LIWORK, LRWORK, LZWORK +! INTEGER, INTENT(OUT) :: K, INFO +! REAL(KIND=WP), INTENT(IN) :: TOL +! Array arguments +! COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & +! W(LDW,*), S(LDS,*) +! COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) +! COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) +! REAL(KIND=WP), INTENT(OUT) :: RES(*) +! REAL(KIND=WP), INTENT(OUT) :: RWORK(*) +! INTEGER, INTENT(OUT) :: IWORK(*) +! +!............................................................ +!> \par Purpose: +! ============= +!> \verbatim +!> ZGEDMD computes the Dynamic Mode Decomposition (DMD) for +!> a pair of data snapshot matrices. For the input matrices +!> X and Y such that Y = A*X with an unaccessible matrix +!> A, ZGEDMD computes a certain number of Ritz pairs of A using +!> the standard Rayleigh-Ritz extraction from a subspace of +!> range(X) that is determined using the leading left singular +!> vectors of X. Optionally, ZGEDMD returns the residuals +!> of the computed Ritz pairs, the information needed for +!> a refinement of the Ritz vectors, or the eigenvectors of +!> the Exact DMD. +!> For further details see the references listed +!> below. For more details of the implementation see [3]. +!> \endverbatim +!............................................................ +!> \par References: +! ================ +!> \verbatim +!> [1] P. Schmid: Dynamic mode decomposition of numerical +!> and experimental data, +!> Journal of Fluid Mechanics 656, 5-28, 2010. +!> [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal +!> decompositions: analysis and enhancements, +!> SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. +!> [3] Z. Drmac: A LAPACK implementation of the Dynamic +!> Mode Decomposition I. Technical report. AIMDyn Inc. +!> and LAPACK Working Note 298. +!> [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. +!> Brunton, N. Kutz: On Dynamic Mode Decomposition: +!> Theory and Applications, Journal of Computational +!> Dynamics 1(2), 391 -421, 2014. +!> \endverbatim !...................................................................... -! Developed and supported by: -! =========================== -! Developed and coded by Zlatko Drmac, Faculty of Science, -! University of Zagreb; drmac@math.hr -! In cooperation with -! AIMdyn Inc., Santa Barbara, CA. -! and supported by -! - DARPA SBIR project "Koopman Operator-Based Forecasting -! for Nonstationary Processes from Near-Term, Limited -! Observational Data" Contract No: W31P4Q-21-C-0007 -! - DARPA PAI project "Physics-Informed Machine Learning -! Methodologies" Contract No: HR0011-18-9-0033 -! - DARPA MoDyL project "A Data-Driven, Operator-Theoretic -! Framework for Space-Time Analysis of Process Dynamics" -! Contract No: HR0011-16-C-0116 -! Any opinions, findings and conclusions or recommendations -! expressed in this material are those of the author and -! do not necessarily reflect the views of the DARPA SBIR -! Program Office -!============================================================ -! Distribution Statement A: -! Approved for Public Release, Distribution Unlimited. -! Cleared by DARPA on September 29, 2022 -!============================================================ +!> \par Developed and supported by: +! ================================ +!> \verbatim +!> Developed and coded by Zlatko Drmac, Faculty of Science, +!> University of Zagreb; drmac@math.hr +!> In cooperation with +!> AIMdyn Inc., Santa Barbara, CA. +!> and supported by +!> - DARPA SBIR project "Koopman Operator-Based Forecasting +!> for Nonstationary Processes from Near-Term, Limited +!> Observational Data" Contract No: W31P4Q-21-C-0007 +!> - DARPA PAI project "Physics-Informed Machine Learning +!> Methodologies" Contract No: HR0011-18-9-0033 +!> - DARPA MoDyL project "A Data-Driven, Operator-Theoretic +!> Framework for Space-Time Analysis of Process Dynamics" +!> Contract No: HR0011-16-C-0116 +!> Any opinions, findings and conclusions or recommendations +!> expressed in this material are those of the author and +!> do not necessarily reflect the views of the DARPA SBIR +!> Program Office +!> \endverbatim +!...................................................................... +!> \par Distribution Statement A: +! ============================== +!> \verbatim +!> Approved for Public Release, Distribution Unlimited. +!> Cleared by DARPA on September 29, 2022 +!> \endverbatim !............................................................ ! Arguments ! ========= -! JOBS (input) CHARACTER*1 -! Determines whether the initial data snapshots are scaled -! by a diagonal matrix. -! 'S' :: The data snapshots matrices X and Y are multiplied -! with a diagonal matrix D so that X*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'C' :: The snapshots are scaled as with the 'S' option. -! If it is found that an i-th column of X is zero -! vector and the corresponding i-th column of Y is -! non-zero, then the i-th column of Y is set to -! zero and a warning flag is raised. -! 'Y' :: The data snapshots matrices X and Y are multiplied -! by a diagonal matrix D so that Y*D has unit -! nonzero columns (in the Euclidean 2-norm) -! 'N' :: No data scaling. +! +!> \param[in] JOBS +!> \verbatim +!> JOBS (input) CHARACTER*1 +!> Determines whether the initial data snapshots are scaled +!> by a diagonal matrix. +!> 'S' :: The data snapshots matrices X and Y are multiplied +!> with a diagonal matrix D so that X*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'C' :: The snapshots are scaled as with the 'S' option. +!> If it is found that an i-th column of X is zero +!> vector and the corresponding i-th column of Y is +!> non-zero, then the i-th column of Y is set to +!> zero and a warning flag is raised. +!> 'Y' :: The data snapshots matrices X and Y are multiplied +!> by a diagonal matrix D so that Y*D has unit +!> nonzero columns (in the Euclidean 2-norm) +!> 'N' :: No data scaling. +!> \endverbatim !..... -! JOBZ (input) CHARACTER*1 -! Determines whether the eigenvectors (Koopman modes) will -! be computed. -! 'V' :: The eigenvectors (Koopman modes) will be computed -! and returned in the matrix Z. -! See the description of Z. -! 'F' :: The eigenvectors (Koopman modes) will be returned -! in factored form as the product X(:,1:K)*W, where X -! contains a POD basis (leading left singular vectors -! of the data matrix X) and W contains the eigenvectors -! of the corresponding Rayleigh quotient. -! See the descriptions of K, X, W, Z. -! 'N' :: The eigenvectors are not computed. +!> \param[in] JOBZ +!> \verbatim +!> JOBZ (input) CHARACTER*1 +!> Determines whether the eigenvectors (Koopman modes) will +!> be computed. +!> 'V' :: The eigenvectors (Koopman modes) will be computed +!> and returned in the matrix Z. +!> See the description of Z. +!> 'F' :: The eigenvectors (Koopman modes) will be returned +!> in factored form as the product X(:,1:K)*W, where X +!> contains a POD basis (leading left singular vectors +!> of the data matrix X) and W contains the eigenvectors +!> of the corresponding Rayleigh quotient. +!> See the descriptions of K, X, W, Z. +!> 'N' :: The eigenvectors are not computed. +!> \endverbatim !..... -! JOBR (input) CHARACTER*1 -! Determines whether to compute the residuals. -! 'R' :: The residuals for the computed eigenpairs will be -! computed and stored in the array RES. -! See the description of RES. -! For this option to be legal, JOBZ must be 'V'. -! 'N' :: The residuals are not computed. +!> \param[in] JOBR +!> \verbatim +!> JOBR (input) CHARACTER*1 +!> Determines whether to compute the residuals. +!> 'R' :: The residuals for the computed eigenpairs will be +!> computed and stored in the array RES. +!> See the description of RES. +!> For this option to be legal, JOBZ must be 'V'. +!> 'N' :: The residuals are not computed. +!> \endverbatim !..... -! JOBF (input) CHARACTER*1 -! Specifies whether to store information needed for post- -! processing (e.g. computing refined Ritz vectors) -! 'R' :: The matrix needed for the refinement of the Ritz -! vectors is computed and stored in the array B. -! See the description of B. -! 'E' :: The unscaled eigenvectors of the Exact DMD are -! computed and returned in the array B. See the -! description of B. -! 'N' :: No eigenvector refinement data is computed. +!> \param[in] JOBF +!> \verbatim +!> JOBF (input) CHARACTER*1 +!> Specifies whether to store information needed for post- +!> processing (e.g. computing refined Ritz vectors) +!> 'R' :: The matrix needed for the refinement of the Ritz +!> vectors is computed and stored in the array B. +!> See the description of B. +!> 'E' :: The unscaled eigenvectors of the Exact DMD are +!> computed and returned in the array B. See the +!> description of B. +!> 'N' :: No eigenvector refinement data is computed. +!> \endverbatim !..... -! WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } -! Allows for a selection of the SVD algorithm from the -! LAPACK library. -! 1 :: ZGESVD (the QR SVD algorithm) -! 2 :: ZGESDD (the Divide and Conquer algorithm; if enough -! workspace available, this is the fastest option) -! 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 -! are the most accurate options) -! 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 -! are the most accurate options) -! For the four methods above, a significant difference in -! the accuracy of small singular values is possible if -! the snapshots vary in norm so that X is severely -! ill-conditioned. If small (smaller than EPS*||X||) -! singular values are of interest and JOBS=='N', then -! the options (3, 4) give the most accurate results, where -! the option 4 is slightly better and with stronger -! theoretical background. -! If JOBS=='S', i.e. the columns of X will be normalized, -! then all methods give nearly equally accurate results. +!> \param[in] WHTSVD +!> \verbatim +!> WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } +!> Allows for a selection of the SVD algorithm from the +!> LAPACK library. +!> 1 :: ZGESVD (the QR SVD algorithm) +!> 2 :: ZGESDD (the Divide and Conquer algorithm; if enough +!> workspace available, this is the fastest option) +!> 3 :: ZGESVDQ (the preconditioned QR SVD ; this and 4 +!> are the most accurate options) +!> 4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 +!> are the most accurate options) +!> For the four methods above, a significant difference in +!> the accuracy of small singular values is possible if +!> the snapshots vary in norm so that X is severely +!> ill-conditioned. If small (smaller than EPS*||X||) +!> singular values are of interest and JOBS=='N', then +!> the options (3, 4) give the most accurate results, where +!> the option 4 is slightly better and with stronger +!> theoretical background. +!> If JOBS=='S', i.e. the columns of X will be normalized, +!> then all methods give nearly equally accurate results. +!> \endverbatim !..... -! M (input) INTEGER, M>= 0 -! The state space dimension (the row dimension of X, Y). +!> \param[in] M +!> \verbatim +!> M (input) INTEGER, M>= 0 +!> The state space dimension (the row dimension of X, Y). +!> \endverbatim !..... -! N (input) INTEGER, 0 <= N <= M -! The number of data snapshot pairs -! (the number of columns of X and Y). +!> \param[in] N +!> \verbatim +!> N (input) INTEGER, 0 <= N <= M +!> The number of data snapshot pairs +!> (the number of columns of X and Y). +!> \endverbatim !..... -! X (input/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, X contains the data snapshot matrix X. It is -! assumed that the column norms of X are in the range of -! the normalized floating point numbers. -! < On exit, the leading K columns of X contain a POD basis, -! i.e. the leading K left singular vectors of the input -! data matrix X, U(:,1:K). All N columns of X contain all -! left singular vectors of the input matrix X. -! See the descriptions of K, Z and W. +!> \param[in] LDX +!> \verbatim +!> X (input/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, X contains the data snapshot matrix X. It is +!> assumed that the column norms of X are in the range of +!> the normalized floating point numbers. +!> < On exit, the leading K columns of X contain a POD basis, +!> i.e. the leading K left singular vectors of the input +!> data matrix X, U(:,1:K). All N columns of X contain all +!> left singular vectors of the input matrix X. +!> See the descriptions of K, Z and W. !..... -! LDX (input) INTEGER, LDX >= M -! The leading dimension of the array X. +!> LDX (input) INTEGER, LDX >= M +!> The leading dimension of the array X. +!> \endverbatim !..... -! Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array -! > On entry, Y contains the data snapshot matrix Y -! < On exit, -! If JOBR == 'R', the leading K columns of Y contain -! the residual vectors for the computed Ritz pairs. -! See the description of RES. -! If JOBR == 'N', Y contains the original input data, -! scaled according to the value of JOBS. +!> \param[in,out] Y +!> \verbatim +!> Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array +!> > On entry, Y contains the data snapshot matrix Y +!> < On exit, +!> If JOBR == 'R', the leading K columns of Y contain +!> the residual vectors for the computed Ritz pairs. +!> See the description of RES. +!> If JOBR == 'N', Y contains the original input data, +!> scaled according to the value of JOBS. +!> \endverbatim !..... -! LDY (input) INTEGER , LDY >= M -! The leading dimension of the array Y. +!> \param[in] LDY +!> \verbatim +!> LDY (input) INTEGER , LDY >= M +!> The leading dimension of the array Y. +!> \endverbatim !..... -! NRNK (input) INTEGER -! Determines the mode how to compute the numerical rank, -! i.e. how to truncate small singular values of the input -! matrix X. On input, if -! NRNK = -1 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(1) -! This option is recommended. -! NRNK = -2 :: i-th singular value sigma(i) is truncated -! if sigma(i) <= TOL*sigma(i-1) -! This option is included for R&D purposes. -! It requires highly accurate SVD, which -! may not be feasible. -! The numerical rank can be enforced by using positive -! value of NRNK as follows: -! 0 < NRNK <= N :: at most NRNK largest singular values -! will be used. If the number of the computed nonzero -! singular values is less than NRNK, then only those -! nonzero values will be used and the actually used -! dimension is less than NRNK. The actual number of -! the nonzero singular values is returned in the variable -! K. See the descriptions of TOL and K. +!> \param[in] NRNK +!> \verbatim +!> NRNK (input) INTEGER +!> Determines the mode how to compute the numerical rank, +!> i.e. how to truncate small singular values of the input +!> matrix X. On input, if +!> NRNK = -1 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(1) +!> This option is recommended. +!> NRNK = -2 :: i-th singular value sigma(i) is truncated +!> if sigma(i) <= TOL*sigma(i-1) +!> This option is included for R&D purposes. +!> It requires highly accurate SVD, which +!> may not be feasible. +!> The numerical rank can be enforced by using positive +!> value of NRNK as follows: +!> 0 < NRNK <= N :: at most NRNK largest singular values +!> will be used. If the number of the computed nonzero +!> singular values is less than NRNK, then only those +!> nonzero values will be used and the actually used +!> dimension is less than NRNK. The actual number of +!> the nonzero singular values is returned in the variable +!> K. See the descriptions of TOL and K. +!> \endverbatim !..... -! TOL (input) REAL(KIND=WP), 0 <= TOL < 1 -! The tolerance for truncating small singular values. -! See the description of NRNK. +!> \param[in] TOL +!> \verbatim +!> TOL (input) REAL(KIND=WP), 0 <= TOL < 1 +!> The tolerance for truncating small singular values. +!> See the description of NRNK. +!> \endverbatim !..... -! K (output) INTEGER, 0 <= K <= N -! The dimension of the POD basis for the data snapshot -! matrix X and the number of the computed Ritz pairs. -! The value of K is determined according to the rule set -! by the parameters NRNK and TOL. -! See the descriptions of NRNK and TOL. +!> \param[out] K +!> \verbatim +!> K (output) INTEGER, 0 <= K <= N +!> The dimension of the POD basis for the data snapshot +!> matrix X and the number of the computed Ritz pairs. +!> The value of K is determined according to the rule set +!> by the parameters NRNK and TOL. +!> See the descriptions of NRNK and TOL. +!> \endverbatim !..... -! EIGS (output) COMPLEX(KIND=WP) N-by-1 array -! The leading K (K<=N) entries of EIGS contain -! the computed eigenvalues (Ritz values). -! See the descriptions of K, and Z. +!> \param[out] EIGS +!> \verbatim +!> EIGS (output) COMPLEX(KIND=WP) N-by-1 array +!> The leading K (K<=N) entries of EIGS contain +!> the computed eigenvalues (Ritz values). +!> See the descriptions of K, and Z. +!> \endverbatim !..... -! Z (workspace/output) COMPLEX(KIND=WP) M-by-N array -! If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) -! is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. -! If JOBZ == 'F', then the Z(:,i)'s are given implicitly as -! the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) -! is an eigenvector corresponding to EIGS(i). The columns -! of W(1:k,1:K) are the computed eigenvectors of the -! K-by-K Rayleigh quotient. -! See the descriptions of EIGS, X and W. +!> \param[out] Z +!> \verbatim +!> Z (workspace/output) COMPLEX(KIND=WP) M-by-N array +!> If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) +!> is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. +!> If JOBZ == 'F', then the Z(:,i)'s are given implicitly as +!> the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) +!> is an eigenvector corresponding to EIGS(i). The columns +!> of W(1:k,1:K) are the computed eigenvectors of the +!> K-by-K Rayleigh quotient. +!> See the descriptions of EIGS, X and W. +!> \endverbatim !..... -! LDZ (input) INTEGER , LDZ >= M -! The leading dimension of the array Z. +!> \param[in] LDZ +!> \verbatim +!> LDZ (input) INTEGER , LDZ >= M +!> The leading dimension of the array Z. +!> \endverbatim !..... -! RES (output) REAL(KIND=WP) N-by-1 array -! RES(1:K) contains the residuals for the K computed -! Ritz pairs, -! RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. -! See the description of EIGS and Z. +!> \param[out] RES +!> \verbatim +!> RES (output) REAL(KIND=WP) N-by-1 array +!> RES(1:K) contains the residuals for the K computed +!> Ritz pairs, +!> RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. +!> See the description of EIGS and Z. +!> \endverbatim !..... -! B (output) COMPLEX(KIND=WP) M-by-N array. -! IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can -! be used for computing the refined vectors; see further -! details in the provided references. -! If JOBF == 'E', B(1:M,1:K) contains -! A*U(:,1:K)*W(1:K,1:K), which are the vectors from the -! Exact DMD, up to scaling by the inverse eigenvalues. -! If JOBF =='N', then B is not referenced. -! See the descriptions of X, W, K. +!> \param[out] B +!> \verbatim +!> B (output) COMPLEX(KIND=WP) M-by-N array. +!> IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can +!> be used for computing the refined vectors; see further +!> details in the provided references. +!> If JOBF == 'E', B(1:M,1:K) contains +!> A*U(:,1:K)*W(1:K,1:K), which are the vectors from the +!> Exact DMD, up to scaling by the inverse eigenvalues. +!> If JOBF =='N', then B is not referenced. +!> See the descriptions of X, W, K. +!> \endverbatim !..... -! LDB (input) INTEGER, LDB >= M -! The leading dimension of the array B. +!> \param[in] LDB +!> \verbatim +!> LDB (input) INTEGER, LDB >= M +!> The leading dimension of the array B. +!> \endverbatim !..... -! W (workspace/output) COMPLEX(KIND=WP) N-by-N array -! On exit, W(1:K,1:K) contains the K computed -! eigenvectors of the matrix Rayleigh quotient. -! The Ritz vectors (returned in Z) are the -! product of X (containing a POD basis for the input -! matrix X) and W. See the descriptions of K, S, X and Z. -! W is also used as a workspace to temporarily store the -! right singular vectors of X. +!> \param[out] W +!> \verbatim +!> W (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> On exit, W(1:K,1:K) contains the K computed +!> eigenvectors of the matrix Rayleigh quotient. +!> The Ritz vectors (returned in Z) are the +!> product of X (containing a POD basis for the input +!> matrix X) and W. See the descriptions of K, S, X and Z. +!> W is also used as a workspace to temporarily store the +!> right singular vectors of X. +!> \endverbatim !..... -! LDW (input) INTEGER, LDW >= N -! The leading dimension of the array W. +!> \param[in] LDW +!> \verbatim +!> LDW (input) INTEGER, LDW >= N +!> The leading dimension of the array W. +!> \endverbatim !..... -! S (workspace/output) COMPLEX(KIND=WP) N-by-N array -! The array S(1:K,1:K) is used for the matrix Rayleigh -! quotient. This content is overwritten during -! the eigenvalue decomposition by ZGEEV. -! See the description of K. +!> \param[out] S +!> \verbatim +!> S (workspace/output) COMPLEX(KIND=WP) N-by-N array +!> The array S(1:K,1:K) is used for the matrix Rayleigh +!> quotient. This content is overwritten during +!> the eigenvalue decomposition by ZGEEV. +!> See the description of K. +!> \endverbatim !..... -! LDS (input) INTEGER, LDS >= N -! The leading dimension of the array S. +!> \param[in] LDS +!> \verbatim +!> LDS (input) INTEGER, LDS >= N +!> The leading dimension of the array S. +!> \endverbatim !..... -! ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array -! ZWORK is used as complex workspace in the complex SVD, as -! specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing -! the eigenvalues of a Rayleigh quotient. -! If the call to ZGEDMD is only workspace query, then -! ZWORK(1) contains the minimal complex workspace length and -! ZWORK(2) is the optimal complex workspace length. -! Hence, the length of work is at least 2. -! See the description of LZWORK. +!> \param[out] ZWORK +!> \verbatim +!> ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array +!> ZWORK is used as complex workspace in the complex SVD, as +!> specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing +!> the eigenvalues of a Rayleigh quotient. +!> If the call to ZGEDMD is only workspace query, then +!> ZWORK(1) contains the minimal complex workspace length and +!> ZWORK(2) is the optimal complex workspace length. +!> Hence, the length of work is at least 2. +!> See the description of LZWORK. +!> \endverbatim !..... -! LZWORK (input) INTEGER -! The minimal length of the workspace vector ZWORK. -! LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), -! where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal -! LZWORK_SVD is calculated as follows -! If WHTSVD == 1 :: ZGESVD :: -! LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) -! If WHTSVD == 2 :: ZGESDD :: -! LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) -! If WHTSVD == 3 :: ZGESVDQ :: -! LZWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LZWORK_SVD = obtainable by a query -! If on entry LZWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths and returns them in -! LZWORK(1) and LZWORK(2), respectively. +!> \param[in] LZWORK +!> \verbatim +!> LZWORK (input) INTEGER +!> The minimal length of the workspace vector ZWORK. +!> LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), +!> where LZWORK_ZGEEV = MAX( 1, 2*N ) and the minimal +!> LZWORK_SVD is calculated as follows +!> If WHTSVD == 1 :: ZGESVD :: +!> LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) +!> If WHTSVD == 2 :: ZGESDD :: +!> LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LZWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LZWORK_SVD = obtainable by a query +!> If on entry LZWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths and returns them in +!> LZWORK(1) and LZWORK(2), respectively. +!> \endverbatim !..... -! RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array -! On exit, RWORK(1:N) contains the singular values of -! X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). -! If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain -! scaling factor RWORK(N+2)/RWORK(N+1) used to scale X -! and Y to avoid overflow in the SVD of X. -! This may be of interest if the scaling option is off -! and as many as possible smallest eigenvalues are -! desired to the highest feasible accuracy. -! If the call to ZGEDMD is only workspace query, then -! RWORK(1) contains the minimal workspace length. -! See the description of LRWORK. +!> \param[out] RWORK +!> \verbatim +!> RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array +!> On exit, RWORK(1:N) contains the singular values of +!> X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). +!> If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain +!> scaling factor RWORK(N+2)/RWORK(N+1) used to scale X +!> and Y to avoid overflow in the SVD of X. +!> This may be of interest if the scaling option is off +!> and as many as possible smallest eigenvalues are +!> desired to the highest feasible accuracy. +!> If the call to ZGEDMD is only workspace query, then +!> RWORK(1) contains the minimal workspace length. +!> See the description of LRWORK. +!> \endverbatim !..... -! LRWORK (input) INTEGER -! The minimal length of the workspace vector RWORK. -! LRWORK is calculated as follows: -! LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where -! LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace -! for the SVD subroutine determined by the input parameter -! WHTSVD. -! If WHTSVD == 1 :: ZGESVD :: -! LRWORK_SVD = 5*MIN(M,N) -! If WHTSVD == 2 :: ZGESDD :: -! LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), -! 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) -! If WHTSVD == 3 :: ZGESVDQ :: -! LRWORK_SVD = obtainable by a query -! If WHTSVD == 4 :: ZGEJSV :: -! LRWORK_SVD = obtainable by a query -! If on entry LRWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! real workspace length and returns it in RWORK(1). +!> \param[in] LRWORK +!> \verbatim +!> LRWORK (input) INTEGER +!> The minimal length of the workspace vector RWORK. +!> LRWORK is calculated as follows: +!> LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where +!> LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace +!> for the SVD subroutine determined by the input parameter +!> WHTSVD. +!> If WHTSVD == 1 :: ZGESVD :: +!> LRWORK_SVD = 5*MIN(M,N) +!> If WHTSVD == 2 :: ZGESDD :: +!> LRWORK_SVD = MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), +!> 2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) +!> If WHTSVD == 3 :: ZGESVDQ :: +!> LRWORK_SVD = obtainable by a query +!> If WHTSVD == 4 :: ZGEJSV :: +!> LRWORK_SVD = obtainable by a query +!> If on entry LRWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> real workspace length and returns it in RWORK(1). +!> \endverbatim !..... -! IWORK (workspace/output) INTEGER LIWORK-by-1 array -! Workspace that is required only if WHTSVD equals -! 2 , 3 or 4. (See the description of WHTSVD). -! If on entry LWORK =-1 or LIWORK=-1, then the -! minimal length of IWORK is computed and returned in -! IWORK(1). See the description of LIWORK. +!> \param[out] IWORK +!> \verbatim +!> IWORK (workspace/output) INTEGER LIWORK-by-1 array +!> Workspace that is required only if WHTSVD equals +!> 2 , 3 or 4. (See the description of WHTSVD). +!> If on entry LWORK =-1 or LIWORK=-1, then the +!> minimal length of IWORK is computed and returned in +!> IWORK(1). See the description of LIWORK. +!> \endverbatim !..... -! LIWORK (input) INTEGER -! The minimal length of the workspace vector IWORK. -! If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 -! If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) -! If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) -! If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) -! If on entry LIWORK = -1, then a workspace query is -! assumed and the procedure only computes the minimal -! and the optimal workspace lengths for ZWORK, RWORK and -! IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \param[in] LIWORK +!> \verbatim +!> LIWORK (input) INTEGER +!> The minimal length of the workspace vector IWORK. +!> If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 +!> If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) +!> If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) +!> If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) +!> If on entry LIWORK = -1, then a workspace query is +!> assumed and the procedure only computes the minimal +!> and the optimal workspace lengths for ZWORK, RWORK and +!> IWORK. See the descriptions of ZWORK, RWORK and IWORK. +!> \endverbatim !..... -! INFO (output) INTEGER -! -i < 0 :: On entry, the i-th argument had an -! illegal value -! = 0 :: Successful return. -! = 1 :: Void input. Quick exit (M=0 or N=0). -! = 2 :: The SVD computation of X did not converge. -! Suggestion: Check the input data and/or -! repeat with different WHTSVD. -! = 3 :: The computation of the eigenvalues did not -! converge. -! = 4 :: If data scaling was requested on input and -! the procedure found inconsistency in the data -! such that for some column index i, -! X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set -! to zero if JOBS=='C'. The computation proceeds -! with original or modified data and warning -! flag is set with INFO=4. +!> \param[out] INFO +!> \verbatim +!> INFO (output) INTEGER +!> -i < 0 :: On entry, the i-th argument had an +!> illegal value +!> = 0 :: Successful return. +!> = 1 :: Void input. Quick exit (M=0 or N=0). +!> = 2 :: The SVD computation of X did not converge. +!> Suggestion: Check the input data and/or +!> repeat with different WHTSVD. +!> = 3 :: The computation of the eigenvalues did not +!> converge. +!> = 4 :: If data scaling was requested on input and +!> the procedure found inconsistency in the data +!> such that for some column index i, +!> X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set +!> to zero if JOBS=='C'. The computation proceeds +!> with original or modified data and warning +!> flag is set with INFO=4. +!> \endverbatim +! +! Authors: +! ======== +! +!> \author Zlatko Drmac +! +!> \ingroup gedmd +! !............................................................. !............................................................. + SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF, WHTSVD, & + M, N, X, LDX, Y, LDY, NRNK, TOL, & + K, EIGS, Z, LDZ, RES, B, LDB, & + W, LDW, S, LDS, ZWORK, LZWORK, & + RWORK, LRWORK, IWORK, LIWORK, INFO ) +! +! -- LAPACK driver routine -- +! +! -- LAPACK is a software package provided by University of -- +! -- Tennessee, University of California Berkeley, University of -- +! -- Colorado Denver and NAG Ltd.. -- +! +!..... + USE iso_fortran_env + IMPLICIT NONE + INTEGER, PARAMETER :: WP = real64 +! +! Scalar arguments +! ~~~~~~~~~~~~~~~~ + CHARACTER, INTENT(IN) :: JOBS, JOBZ, JOBR, JOBF + INTEGER, INTENT(IN) :: WHTSVD, M, N, LDX, LDY, & + NRNK, LDZ, LDB, LDW, LDS, & + LIWORK, LRWORK, LZWORK + INTEGER, INTENT(OUT) :: K, INFO + REAL(KIND=WP), INTENT(IN) :: TOL +! +! Array arguments +! ~~~~~~~~~~~~~~~ + COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*) + COMPLEX(KIND=WP), INTENT(OUT) :: Z(LDZ,*), B(LDB,*), & + W(LDW,*), S(LDS,*) + COMPLEX(KIND=WP), INTENT(OUT) :: EIGS(*) + COMPLEX(KIND=WP), INTENT(OUT) :: ZWORK(*) + REAL(KIND=WP), INTENT(OUT) :: RES(*) + REAL(KIND=WP), INTENT(OUT) :: RWORK(*) + INTEGER, INTENT(OUT) :: IWORK(*) +! ! Parameters ! ~~~~~~~~~~ REAL(KIND=WP), PARAMETER :: ONE = 1.0_WP REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP COMPLEX(KIND=WP), PARAMETER :: ZONE = ( 1.0_WP, 0.0_WP ) COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP ) - +! ! Local scalars ! ~~~~~~~~~~~~~ REAL(KIND=WP) :: OFL, ROOTSC, SCALE, SMALL, & @@ -401,7 +551,7 @@ ! Local arrays ! ~~~~~~~~~~~~ REAL(KIND=WP) :: RDUMMY(2) - +! ! External functions (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~ REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2 @@ -409,13 +559,13 @@ INTEGER IZAMAX LOGICAL DISNAN, LSAME EXTERNAL DISNAN, LSAME - +! ! External subroutines (BLAS and LAPACK) ! ~~~~~~~~~~~~~~~~~~~~ EXTERNAL ZAXPY, ZGEMM, ZDSCAL EXTERNAL ZGEEV, ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, & ZLACPY, ZLASCL, ZLASSQ, XERBLA - +! ! Intrinsic functions ! ~~~~~~~~~~~~~~~~~~~ INTRINSIC DBLE, INT, MAX, SQRT @@ -608,7 +758,8 @@ K = 0 DO i = 1, N !WORK(i) = DZNRM2( M, X(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 @@ -681,7 +832,8 @@ ! carefully computed using ZLASSQ. DO i = 1, N !RWORK(i) = DZNRM2( M, Y(1,i), 1 ) - SCALE = ZERO + SSUM = ONE + SCALE = ZERO CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM ) IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN K = 0 From 283713e4c541d9e77b4d3922bdc40245a8354a89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:32:45 +0100 Subject: [PATCH 506/718] Add tests for ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/TESTING/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index b4e2223f7..d4e6f970d 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts) add_lapack_test(sgsv.out gsv.in xeigtsts) add_lapack_test(scsd.out csd.in xeigtsts) add_lapack_test(slse.out lse.in xeigtsts) +# +# ======== SINGLE DMD EIG TESTS =========================== +add_lapack_test(sdmd.out sdmd.in xdmdeigtsts) endif() if(BUILD_DOUBLE) @@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd) add_lapack_test(dgsv.out gsv.in xeigtstd) add_lapack_test(dcsd.out csd.in xeigtstd) add_lapack_test(dlse.out lse.in xeigtstd) +# +# ======== DOUBLE DMD EIG TESTS =========================== +add_lapack_test(ddmd.out ddmd.in xdmdeigtstd) endif() if(BUILD_COMPLEX) @@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc) add_lapack_test(cgsv.out gsv.in xeigtstc) add_lapack_test(ccsd.out csd.in xeigtstc) add_lapack_test(clse.out lse.in xeigtstc) +# +# ======== COMPLEX DMD EIG TESTS =========================== +add_lapack_test(cdmd.out cdmd.in xdmdeigtstc) endif() if(BUILD_COMPLEX16) @@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz) add_lapack_test(zgsv.out gsv.in xeigtstz) add_lapack_test(zcsd.out csd.in xeigtstz) add_lapack_test(zlse.out lse.in xeigtstz) +# +# ======== COMPLEX16 DMD EIG TESTS =========================== +add_lapack_test(zdmd.out zdmd.in xdmdeigtstz) endif() From c6fa9210278da4c505aec2fab8244ad009b1f944 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Dec 2023 23:39:53 +0100 Subject: [PATCH 507/718] Add tests for ?GEDMD (Reference-LAPACK PR 959) --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index d252c7fa9..e7236677a 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -42,6 +42,8 @@ set(SEIGTST schkee.F sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyl01.f ssyt21.f ssyt22.f) +set(SDMDEIGTST schkdmd.f90) + set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f @@ -59,6 +61,8 @@ set(CEIGTST cchkee.F csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) +set(CDMDEIGTST cchkdmd.f90) + set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) @@ -79,6 +83,8 @@ set(DEIGTST dchkee.F dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyl01.f dsyt21.f dsyt22.f) +set(DDMDEIGTST dchkdmd.f90) + set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f @@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) +set(ZDMDEIGTST zchkdmd.f90) + macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) @@ -104,16 +112,20 @@ endmacro() if(BUILD_SINGLE) add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtsts ${SDMDEIGTST}) endif() if(BUILD_COMPLEX) add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstc ${CDMDEIGTST}) endif() if(BUILD_DOUBLE) add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstd ${DDMDEIGTST}) endif() if(BUILD_COMPLEX16) add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST}) +add_eig_executable(xdmdeigtstz ${ZDMDEIGTST}) endif() From 0baf462dbca147a82db9412b521459b60e417d53 Mon Sep 17 00:00:00 2001 From: Wu Xiaotian Date: Wed, 20 Dec 2023 10:34:47 +0800 Subject: [PATCH 508/718] Fix: build failed on LoongArch According to the documentation at https://github.com/loongson/la-abi-specs/blob/release/lapcs.adoc#the-base-abi-variants, valid -mabi parameters are lp64s, lp64f, lp64d, ilp32s, ilp32f and ilp32d. --- cmake/cc.cmake | 14 ++++++++++++-- cmake/fc.cmake | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 00952e810..2da941afb 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (LOONGARCH64) if (BINARY64) - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + endif () else () - set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d") + else() + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () endif () set(BINARY_DEFINED 1) endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index c496f6368..5c30be843 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (LOONGARCH64) if (BINARY64) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + endif () else () - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () endif () endif () if (RISCV64) From d2f4f1b28aa4c00a4bc4bc1b512f162b349cae4c Mon Sep 17 00:00:00 2001 From: Xiaotian Wu Date: Wed, 20 Dec 2023 14:13:04 +0800 Subject: [PATCH 509/718] CI: update toolchains for LoongArch64 --- .github/workflows/loongarch64.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 4a9bf98b6..42393eb0c 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -16,13 +16,13 @@ jobs: include: - target: LOONGSONGENERIC triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - target: LOONGSON3R5 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON3R5 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu - opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - target: DYNAMIC_ARCH triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC @@ -40,8 +40,8 @@ jobs: - name: Download and install loongarch64-toolchain run: | - wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz - tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - name: Set env run: | From 1106460bb3f5b6d17a77acafaf1fa7ba6051acbd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Dec 2023 12:29:56 +0100 Subject: [PATCH 510/718] remove redundant targets from the default ARM64 DYNAMIC_ARCH list --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index ebdc5a833..eb974456b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () From e7a895e7140f538d14ea5b9354d33840b76e44d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Dec 2023 12:36:05 +0100 Subject: [PATCH 511/718] Add Apple M as NeoverseN1 --- driver/others/dynamic_arm64.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index b5fb8161d..803e0b5eb 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; char coremsg[128]; +#if defined (OS_DARWIN) + return &gotoblas_NEOVERSEN1; +#endif + #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; #else @@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + case 0x61: // Apple + return &gotoblas_NEOVERSEN1; + break; default: snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); From c6996a80e950535306e281f7f2da54f387805d4e Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Fri, 8 Dec 2023 16:06:17 +0800 Subject: [PATCH 512/718] loongarch64: Refine amax,amin,max,min optimization. --- common_loongarch64.h | 24 +++ kernel/loongarch64/KERNEL.LOONGSON2K1000 | 16 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 16 +- kernel/loongarch64/amax_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amax_lsx.S | 231 ++++++++++++++++++++++ kernel/loongarch64/amin_lasx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/amin_lsx.S | 232 +++++++++++++++++++++++ kernel/loongarch64/damax_lasx.S | 183 ------------------ kernel/loongarch64/damax_lsx.S | 145 -------------- kernel/loongarch64/damin_lasx.S | 178 ----------------- kernel/loongarch64/damin_lsx.S | 145 -------------- kernel/loongarch64/dmax_lasx.S | 175 ----------------- kernel/loongarch64/dmax_lsx.S | 141 -------------- kernel/loongarch64/dmin_lasx.S | 175 ----------------- kernel/loongarch64/dmin_lsx.S | 143 -------------- kernel/loongarch64/max_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/max_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/min_lasx.S | 229 ++++++++++++++++++++++ kernel/loongarch64/min_lsx.S | 228 ++++++++++++++++++++++ kernel/loongarch64/samax_lasx.S | 208 -------------------- kernel/loongarch64/samax_lsx.S | 177 ----------------- kernel/loongarch64/samin_lasx.S | 208 -------------------- kernel/loongarch64/samin_lsx.S | 177 ----------------- kernel/loongarch64/smax_lasx.S | 205 -------------------- kernel/loongarch64/smax_lsx.S | 171 ----------------- kernel/loongarch64/smin_lasx.S | 205 -------------------- kernel/loongarch64/smin_lsx.S | 174 ----------------- 27 files changed, 1881 insertions(+), 2826 deletions(-) create mode 100644 kernel/loongarch64/amax_lasx.S create mode 100644 kernel/loongarch64/amax_lsx.S create mode 100644 kernel/loongarch64/amin_lasx.S create mode 100644 kernel/loongarch64/amin_lsx.S delete mode 100644 kernel/loongarch64/damax_lasx.S delete mode 100644 kernel/loongarch64/damax_lsx.S delete mode 100644 kernel/loongarch64/damin_lasx.S delete mode 100644 kernel/loongarch64/damin_lsx.S delete mode 100644 kernel/loongarch64/dmax_lasx.S delete mode 100644 kernel/loongarch64/dmax_lsx.S delete mode 100644 kernel/loongarch64/dmin_lasx.S delete mode 100644 kernel/loongarch64/dmin_lsx.S create mode 100644 kernel/loongarch64/max_lasx.S create mode 100644 kernel/loongarch64/max_lsx.S create mode 100644 kernel/loongarch64/min_lasx.S create mode 100644 kernel/loongarch64/min_lsx.S delete mode 100644 kernel/loongarch64/samax_lasx.S delete mode 100644 kernel/loongarch64/samax_lsx.S delete mode 100644 kernel/loongarch64/samin_lasx.S delete mode 100644 kernel/loongarch64/samin_lsx.S delete mode 100644 kernel/loongarch64/smax_lasx.S delete mode 100644 kernel/loongarch64/smax_lsx.S delete mode 100644 kernel/loongarch64/smin_lasx.S delete mode 100644 kernel/loongarch64/smin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 4963b2f07..72e900f77 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -120,6 +120,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.d #define FABS fabs.d +#define FMIN fmin.d +#define FMINA fmina.d +#define FMAX fmax.d +#define FMAXA fmaxa.d #define CMPEQ fcmp.ceq.d #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d @@ -128,10 +132,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d #define XVFMADD xvfmadd.d +#define XVFMIN xvfmin.d +#define XVFMINA xvfmina.d +#define XVFMAX xvfmax.d +#define XVFMAXA xvfmaxa.d #define VFSUB vfsub.d #define VFADD vfadd.d #define VFMADD vfmadd.d +#define VFMIN vfmin.d +#define VFMINA vfmina.d +#define VFMAX vfmax.d +#define VFMAXA vfmaxa.d #else @@ -148,6 +160,10 @@ static inline int WhereAmI(void){ #define CMOVT fsel #define MTC movgr2fr.w #define FABS fabs.s +#define FMIN fmin.s +#define FMINA fmina.s +#define FMAX fmax.s +#define FMAXA fmaxa.s #define CMPEQ fcmp.ceq.s #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s @@ -156,10 +172,18 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s #define XVFMADD xvfmadd.s +#define XVFMIN xvfmin.s +#define XVFMINA xvfmina.s +#define XVFMAX xvfmax.s +#define XVFMAXA xvfmaxa.s #define VFSUB vfsub.s #define VFADD vfadd.s #define VFMADD vfmadd.s +#define VFMIN vfmin.s +#define VFMINA vfmina.s +#define VFMAX vfmax.s +#define VFMAXA vfmaxa.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 1e4fa7a9d..802dd1c9b 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lsx.S SSCALKERNEL = sscal_lsx.S DSCALKERNEL = dscal_lsx.S -SAMAXKERNEL = samax_lsx.S -DAMAXKERNEL = damax_lsx.S +SAMAXKERNEL = amax_lsx.S +DAMAXKERNEL = amax_lsx.S -SAMINKERNEL = samin_lsx.S -DAMINKERNEL = damin_lsx.S +SAMINKERNEL = amin_lsx.S +DAMINKERNEL = amin_lsx.S -SMAXKERNEL = smax_lsx.S -DMAXKERNEL = dmax_lsx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lsx.S -DMINKERNEL = dmin_lsx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lsx.S IDMAXKERNEL = idmax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f00abcb32..3253489d9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -7,17 +7,17 @@ DDOTKERNEL = dot_lasx.S SSCALKERNEL = sscal_lasx.S DSCALKERNEL = dscal_lasx.S -SAMAXKERNEL = samax_lasx.S -DAMAXKERNEL = damax_lasx.S +SAMAXKERNEL = amax_lasx.S +DAMAXKERNEL = amax_lasx.S -SAMINKERNEL = samin_lasx.S -DAMINKERNEL = damin_lasx.S +SAMINKERNEL = amin_lasx.S +DAMINKERNEL = amin_lasx.S -SMAXKERNEL = smax_lasx.S -DMAXKERNEL = dmax_lasx.S +SMAXKERNEL = max_lsx.S +DMAXKERNEL = max_lsx.S -SMINKERNEL = smin_lasx.S -DMINKERNEL = dmin_lasx.S +SMINKERNEL = min_lsx.S +DMINKERNEL = min_lsx.S ISMAXKERNEL = ismax_lasx.S IDMAXKERNEL = idmax_lasx.S diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S new file mode 100644 index 000000000..e964d4ddb --- /dev/null +++ b/kernel/loongarch64/amax_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM0, VM1 + XVFMAXA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMAXA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMAXA VM1, VX0, VX1 + XVFMAXA VM2, VX2, VX3 + XVFMAXA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMAXA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S new file mode 100644 index 000000000..fb3b77a0e --- /dev/null +++ b/kernel/loongarch64/amax_lsx.S @@ -0,0 +1,231 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM0, VM1 + VFMAXA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMAXA VM1, VX0, VX1 + VFMAXA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMAXA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMAXA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMAXA VM1, VX0, VX1 + VFMAXA VM2, VX2, VX3 + VFMAXA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMAXA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S new file mode 100644 index 000000000..0a4359002 --- /dev/null +++ b/kernel/loongarch64/amin_lasx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $xr0 +#define VM1 $xr1 +#define VM2 $xr2 +#define VX0 $xr3 +#define VX1 $xr4 +#define VX2 $xr5 +#define VX3 $xr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + xvldrepl.d VM0, X, 0 +#else + xvldrepl.w VM0, X, 0 +#endif + XVFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 4 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvld VX2, X, 64 + xvld VX3, X, 96 + addi.d I, I, -1 + addi.d X, X, 128 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM0, VM1 + XVFMINA VM0, VM0, VM2 +#else + xvld VX0, X, 0 + xvld VX1, X, 32 + addi.d I, I, -1 + addi.d X, X, 64 + XVFMINA VM1, VX0, VX1 + XVFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 0x1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L11: + andi I, N, 0x0f + bge $r0, I, .L13 + .align 3 + +.L12: /* 0 < N < 16 */ + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 0 + xvinsgr2vr.w VM1, t2, 1 + xvinsgr2vr.w VM1, t3, 2 + xvinsgr2vr.w VM1, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VM1, t1, 4 + xvinsgr2vr.w VM1, t2, 5 + xvinsgr2vr.w VM1, t3, 6 + xvinsgr2vr.w VM1, t4, 7 + xvfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvrepl128vei.d VX0, VM0, 0 + xvrepl128vei.d VX1, VM0, 1 + XVFMINA VM0, VX0, VX1 +#else + xvrepl128vei.w VX0, VM0, 0 + xvrepl128vei.w VX1, VM0, 1 + xvrepl128vei.w VX2, VM0, 2 + xvrepl128vei.w VX3, VM0, 3 + XVFMINA VM1, VX0, VX1 + XVFMINA VM2, VX2, VX3 + XVFMINA VM0, VM1, VM2 +#endif + xvpermi.q VM1, VM0, 1 + XVFMINA VM0, VM0, VM1 + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: /* 0 < N < 8 */ + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S new file mode 100644 index 000000000..644caf43c --- /dev/null +++ b/kernel/loongarch64/amin_lsx.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r12 +#define TEMP $r13 + +#define VM0 $vr0 +#define VM1 $vr1 +#define VM2 $vr2 +#define VX0 $vr3 +#define VX1 $vr4 +#define VX2 $vr5 +#define VX3 $vr6 + +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r17 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT +#ifdef DOUBLE + vldrepl.d VM0, X, 0 +#else + vldrepl.w VM0, X, 0 +#endif + VFSUB VM0, VM0, VM0 + bne INCX, TEMP, .L20 + + srai.d I, N, 3 + bge $r0, I, .L11 + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, X, 32 + vld VX3, X, 48 + addi.d I, I, -1 + addi.d X, X, 64 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM0, VM1 + VFMINA VM0, VM0, VM2 +#else + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + addi.d X, X, 32 + VFMINA VM1, VX0, VX1 + VFMINA VM0, VM0, VM1 +#endif + blt $r0, I, .L10 + +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L11: + andi I, N, 7 + bge $r0, I, .L13 + .align 3 + +.L12: + LD $f1, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + FMINA $f0, $f0, $f1 + bnez I, .L12 + .align 3 + +.L13: + FABS $f0, $f0 + SUB $f0, $f0, $f0 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + srai.d I, N, 3 + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM1, VM1, VM2 + vfmaxa.d VM0, VM0, VM1 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmaxa.s VM1, VX0, VX1 + vfmaxa.s VM0, VM0, VM1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + VFMINA VM0, VX0, VX1 +#else + vreplvei.w VX0, VM0, 0 + vreplvei.w VX1, VM0, 1 + vreplvei.w VX2, VM0, 2 + vreplvei.w VX3, VM0, 3 + VFMINA VM1, VX0, VX1 + VFMINA VM2, VX2, VX3 + VFMINA VM0, VM1, VM2 +#endif + .align 3 + +.L23: //INCX!=1 and N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f1, X, 0 + addi.d I, I, -1 + add.d X, X, INCX + FMINA $f0, $f0, $f1 + bnez I, .L24 + .align 3 + +.L999: + FABS $f0, $f0 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S deleted file mode 100644 index c44ce4995..000000000 --- a/kernel/loongarch64/damax_lasx.S +++ /dev/null @@ -1,183 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define J $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define TEMP $r16 -#define m0 $xr8 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define x5 $xr13 -#define x6 $xr14 -#define x7 $xr15 -#define x8 $xr16 -#define VX0 $xr20 -#define VX1 $xr21 -#define VM0 $xr22 -#define VM1 $xr23 -#define VM2 $xr18 -#define VM3 $xr19 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - srai.d I, N, 3 - bge $r0, I, .L12 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - addi.d I, I, -1 - xvfmaxa.d VM1, VX1, VX0 - addi.d X, X, 8 * SIZE - xvfmaxa.d VM0, VM0, VM1 - blt $r0, I, .L10 - .align 3 - -.L11: - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfmaxa.d VM2, x3, x4 - xvfmaxa.d VM0, VM1, VM2 - .align 3 - -.L12: //INCX==1 and N<8 - andi I, N, 7 - li.d J, 4 - bge J, I, .L13 // 4 Date: Wed, 27 Dec 2023 11:30:17 +0800 Subject: [PATCH 513/718] loongarch64: Refine copy,swap,nrm2,sum optimization. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 12 +- kernel/loongarch64/copy_lasx.S | 306 ++++++++++++++++ kernel/loongarch64/copy_lsx.S | 316 +++++++++++++++++ kernel/loongarch64/dcopy_lasx.S | 224 ------------ kernel/loongarch64/dcopy_lsx.S | 232 ------------ kernel/loongarch64/dnrm2_lasx.S | 124 ++++--- kernel/loongarch64/dnrm2_lsx.S | 84 +++-- kernel/loongarch64/dsum_lasx.S | 125 ------- kernel/loongarch64/dsum_lsx.S | 123 ------- kernel/loongarch64/dswap_lasx.S | 301 ---------------- kernel/loongarch64/dswap_lsx.S | 317 ----------------- kernel/loongarch64/scopy_lasx.S | 216 ------------ kernel/loongarch64/scopy_lsx.S | 220 ------------ kernel/loongarch64/snrm2_lasx.S | 116 +++--- kernel/loongarch64/snrm2_lsx.S | 141 ++++---- kernel/loongarch64/ssum_lasx.S | 140 -------- kernel/loongarch64/ssum_lsx.S | 125 ------- kernel/loongarch64/sswap_lasx.S | 286 --------------- kernel/loongarch64/sswap_lsx.S | 294 ---------------- kernel/loongarch64/sum_lasx.S | 225 ++++++++++++ kernel/loongarch64/sum_lsx.S | 204 +++++++++++ kernel/loongarch64/swap_lasx.S | 401 +++++++++++++++++++++ kernel/loongarch64/swap_lsx.S | 431 +++++++++++++++++++++++ 24 files changed, 2159 insertions(+), 2816 deletions(-) create mode 100644 kernel/loongarch64/copy_lasx.S create mode 100644 kernel/loongarch64/copy_lsx.S delete mode 100644 kernel/loongarch64/dcopy_lasx.S delete mode 100644 kernel/loongarch64/dcopy_lsx.S delete mode 100644 kernel/loongarch64/dsum_lasx.S delete mode 100644 kernel/loongarch64/dsum_lsx.S delete mode 100644 kernel/loongarch64/dswap_lasx.S delete mode 100644 kernel/loongarch64/dswap_lsx.S delete mode 100644 kernel/loongarch64/scopy_lasx.S delete mode 100644 kernel/loongarch64/scopy_lsx.S delete mode 100644 kernel/loongarch64/ssum_lasx.S delete mode 100644 kernel/loongarch64/ssum_lsx.S delete mode 100644 kernel/loongarch64/sswap_lasx.S delete mode 100644 kernel/loongarch64/sswap_lsx.S create mode 100644 kernel/loongarch64/sum_lasx.S create mode 100644 kernel/loongarch64/sum_lsx.S create mode 100644 kernel/loongarch64/swap_lasx.S create mode 100644 kernel/loongarch64/swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 802dd1c9b..cb230b348 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S ISAMINKERNEL = isamin_lsx.S IDAMINKERNEL = idamin_lsx.S -SCOPYKERNEL = scopy_lsx.S -DCOPYKERNEL = dcopy_lsx.S +SCOPYKERNEL = copy_lsx.S +DCOPYKERNEL = copy_lsx.S -SSWAPKERNEL = sswap_lsx.S -DSWAPKERNEL = dswap_lsx.S +SSWAPKERNEL = swap_lsx.S +DSWAPKERNEL = swap_lsx.S SAXPYKERNEL = saxpy_lsx.S DAXPYKERNEL = daxpy_lsx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S -SSUMKERNEL = ssum_lsx.S -DSUMKERNEL = dsum_lsx.S +SSUMKERNEL = sum_lsx.S +DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3253489d9..ba59c4566 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S ISAMINKERNEL = isamin_lasx.S IDAMINKERNEL = idamin_lasx.S -SCOPYKERNEL = scopy_lasx.S -DCOPYKERNEL = dcopy_lasx.S +SCOPYKERNEL = copy_lasx.S +DCOPYKERNEL = copy_lasx.S -SSWAPKERNEL = sswap_lasx.S -DSWAPKERNEL = dswap_lasx.S +SSWAPKERNEL = swap_lasx.S +DSWAPKERNEL = swap_lasx.S SAXPYKERNEL = saxpy_lasx.S DAXPYKERNEL = daxpy_lasx.S @@ -43,8 +43,8 @@ DAXPYKERNEL = daxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S -SSUMKERNEL = ssum_lasx.S -DSUMKERNEL = dsum_lasx.S +SSUMKERNEL = sum_lasx.S +DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S new file mode 100644 index 000000000..31f91cec1 --- /dev/null +++ b/kernel/loongarch64/copy_lasx.S @@ -0,0 +1,306 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + addi.d I, I, -1 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + xvld VX1, X, 32 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0, 3 + add.d Y, Y, INCY +#else + xvld VX0, X, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0, 7 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvst VX0, Y, 0 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvst VX1, Y, 32 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvst VX0, Y, 0 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S new file mode 100644 index 000000000..bb10f3565 --- /dev/null +++ b/kernel/loongarch64/copy_lsx.S @@ -0,0 +1,316 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + addi.d I, I, -1 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + vld VX0, X, 32 + vld VX1, X, 48 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY +#else + vld VX0, X, 0 + vld VX1, X, 16 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0, 3 + add.d Y, Y, INCY +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + addi.d I, I, -1 + addi.d X, X, SIZE + ST $f12, Y, 0 + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 + vst VX1, Y, 16 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 32 + vst VX1, Y, 48 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vst VX0, Y, 0 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX1, Y, 16 +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +/* INCX!=1 and INCY!=1 */ +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + ST a1, Y, 0 + add.d Y, Y, INCY + ST a2, Y, 0 + add.d Y, Y, INCY + ST a3, X, 0 + add.d Y, Y, INCY + ST a4, X, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S deleted file mode 100644 index 9d7da4a80..000000000 --- a/kernel/loongarch64/dcopy_lasx.S +++ /dev/null @@ -1,224 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.d VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvst VX0, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S deleted file mode 100644 index 161655bbd..000000000 --- a/kernel/loongarch64/dcopy_lsx.S +++ /dev/null @@ -1,232 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - addi.d I, I, -1 - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.d $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 0 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 2 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vst VX0, Y, 4 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.d a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S index 2a9c3cf7b..5a6f7cf1e 100644 --- a/kernel/loongarch64/dnrm2_lasx.S +++ b/kernel/loongarch64/dnrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VM0 $xr17 @@ -35,6 +69,7 @@ xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,12 +81,11 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - xvld VM0, X, 0 bge $r0, I, .L97 .align 3 .L10: - xvld VX0, X, 0 * SIZE + xvld VX0, X, 0 xvld VX1, X, 4 * SIZE xvfmaxa.d VM1, VX1, VX0 xvfmaxa.d VM0, VM0, VM1 @@ -62,40 +96,32 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 .align 3 .L21: - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t2, 1 - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t3, 2 - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 add.d X, X, INCX xvinsgr2vr.d VX1, t4, 3 xvfmaxa.d VM1, VX0, VX1 @@ -109,9 +135,9 @@ xvpickve.d VX0, VM0, 1 xvpickve.d VX1, VM0, 2 xvpickve.d VM3, VM0, 3 - xvfmaxa.d VM1, VX0, VX1 - xvfmaxa.d VM2, VM3, VM0 - xvfmaxa.d VM0, VM1, VM2 + fmaxa.d $f17, $f17, $f14 + fmaxa.d $f17, $f17, $f15 + fmaxa.d $f17, $f17, $f16 .align 3 .L97: @@ -149,12 +175,12 @@ .align 3 .L110: - xvld VX0, XX, 0 * SIZE + xvld VX0, XX, 0 xvld VX1, XX, 4 * SIZE - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -166,34 +192,34 @@ bge $r0, I, .L997 .L121: - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, XX, 0 * SIZE + ld.d t1, XX, 0 add.d XX, XX, INCX - ld.d t2, XX, 0 * SIZE + ld.d t2, XX, 0 add.d XX, XX, INCX - ld.d t3, XX, 0 * SIZE + ld.d t3, XX, 0 add.d XX, XX, INCX - ld.d t4, XX, 0 * SIZE + ld.d t4, XX, 0 add.d XX, XX, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 xvinsgr2vr.d VX1, t3, 2 xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VM0, VX0, VALPHA - xvfmul.d VM1, VX1, VALPHA - xvfmadd.d res1, VM0, VM0, res1 - xvfmadd.d res2, VM1, VM1, res2 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -203,10 +229,10 @@ xvfadd.d res1, res1, res2 xvpickve.d VX0, res1, 1 xvpickve.d VX1, res1, 2 - xvpickve.d VM0, res1, 3 - xvfadd.d res1, VX0, res1 - xvfadd.d VX1, VX1, VM0 - xvfadd.d res1, VX1, res1 + xvpickve.d VM2, res1, 3 + fadd.d $f19, $f19, $f15 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f13 .align 3 .L997: @@ -215,19 +241,17 @@ .align 3 .L998: - fld.d $f15, XX, 0 * SIZE + fld.d $f15, XX, 0 addi.d I, I, -1 fmul.d $f15, $f15, ALPHA fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 - .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S index e4615e18d..fce4260e2 100644 --- a/kernel/loongarch64/dnrm2_lsx.S +++ b/kernel/loongarch64/dnrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -12,6 +44,8 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $vr15 #define VX1 $vr16 #define VM0 $vr17 @@ -35,6 +69,7 @@ vxor.v res1, res1, res1 vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 beq $r0, INCX, .L999 move XX, X @@ -46,7 +81,7 @@ slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - vld VM0, X, 0 + bge $r0, I, .L97 .align 3 @@ -66,15 +101,7 @@ .align 3 .L20: // INCX!=1 - move TEMP, X // initialize the maxa value - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 bge $r0, I, .L97 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 .align 3 .L21: @@ -154,16 +181,16 @@ .L110: vld VX0, XX, 0 * SIZE vld VX1, XX, 2 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 vld VX0, XX, 4 * SIZE vld VX1, XX, 6 * SIZE - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d XX, XX, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L110 @@ -173,6 +200,7 @@ .L120: srai.d I, N, 3 bge $r0, I, .L997 + .align 3 .L121: ld.d t1, XX, 0 * SIZE @@ -187,14 +215,14 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA + vfmul.d VM2, VX0, VALPHA ld.d t1, XX, 0 * SIZE add.d XX, XX, INCX - vfmul.d VM1, VX1, VALPHA + vfmul.d VM3, VX1, VALPHA ld.d t2, XX, 0 * SIZE add.d XX, XX, INCX - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 ld.d t3, XX, 0 * SIZE add.d XX, XX, INCX ld.d t4, XX, 0 * SIZE @@ -203,10 +231,10 @@ vinsgr2vr.d VX0, t2, 1 vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - vfmul.d VM0, VX0, VALPHA - vfmul.d VM1, VX1, VALPHA - vfmadd.d res1, VM0, VM0, res1 - vfmadd.d res2, VM1, VM1, res2 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 addi.d I, I, -1 blt $r0, I, .L121 b .L996 @@ -230,13 +258,11 @@ fmadd.d $f19, $f15, $f15, $f19 add.d XX, XX , INCX blt $r0, I, .L998 - fsqrt.d $f19, $f19 - fmul.d $f0, max, $f19 - jirl $r0, $r1, 0x0 .align 3 .L999: - fmov.d $f0, $f19 + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 jirl $r0, $r1, 0x0 EPILOGUE diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S deleted file mode 100644 index 3c51dab60..000000000 --- a/kernel/loongarch64/dsum_lasx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - addi.d I, I, -1 - fadd.d $f16, $f12, $f16 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S deleted file mode 100644 index 402d087df..000000000 --- a/kernel/loongarch64/dsum_lsx.S +++ /dev/null @@ -1,123 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S deleted file mode 100644 index 221cb7fa2..000000000 --- a/kernel/loongarch64/dswap_lasx.S +++ /dev/null @@ -1,301 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvld VX2, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - xvst VX0, Y, 0 * SIZE - xvst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX0, Y, 0, 3 - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvstelm.d VX1, Y, 0, 3 - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - xvstelm.d VX2, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX2, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX2, X, 0, 3 - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvst VX0, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - xvstelm.d VX3, X, 0, 0 - add.d X, X, INCY - ld.d t2, X, 0 * SIZE - xvstelm.d VX3, X, 0, 1 - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - xvstelm.d VX3, X, 0, 2 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvstelm.d VX3, X, 0, 3 - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S deleted file mode 100644 index 7f7f585e1..000000000 --- a/kernel/loongarch64/dswap_lsx.S +++ /dev/null @@ -1,317 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX2, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - addi.d I, I, -1 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE - vst VX0, Y, 4 * SIZE - vst VX1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vstelm.d VX0, Y, 0, 1 - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vst VX2, X, 4 * SIZE - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vstelm.d VX1, Y, 0, 1 - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vst VX3, X, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 0 * SIZE - vld VX3, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 2 * SIZE - vld VX2, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - vstelm.d VX2, X, 0, 0 - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vstelm.d VX2, X, 0, 1 - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCY - vst VX0, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - vstelm.d VX3, X, 0, 0 - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vstelm.d VX3, X, 0, 1 - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vst VX1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bgez INCX, .L220 - //addi.d TEMP, N, -1 - //mul.d TEMP, TEMP, INCX - //sub.d X, X, TEMP - .align 3 - -.L220: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a1, X, 0 * SIZE - add.d X, X, INCX - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b1, Y, 0 * SIZE - fst.d a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a2, X, 0 * SIZE - add.d X, X, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b2, Y, 0 * SIZE - fst.d a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.d a3, X, 0 * SIZE - add.d X, X, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b3, Y, 0 * SIZE - fst.d a3, Y, 0 * SIZE - fld.d a4, X, 0 * SIZE - add.d X, X, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.d b4, Y, 0 * SIZE - fst.d a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.d b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.d b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.d $f12, Y, 0 * SIZE - fst.d $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S deleted file mode 100644 index 7db1e7cee..000000000 --- a/kernel/loongarch64/scopy_lasx.S +++ /dev/null @@ -1,216 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $xr12 -#define VX1 $xr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - xvstelm.w VX0, Y, 0, 7 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S deleted file mode 100644 index 32150d3d6..000000000 --- a/kernel/loongarch64/scopy_lsx.S +++ /dev/null @@ -1,220 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define I $r17 -#define TEMP $r18 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define VX0 $vr12 -#define VX1 $vr13 - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX0, Y, 0, 3 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - vstelm.w VX1, Y, 0, 3 - add.d Y, Y, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - fst.s $f12, Y, 0 * SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vst VX0, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s a3, X, 0 * SIZE - add.d Y, Y, INCY - fst.s a4, X, 0 * SIZE - add.d Y, Y, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 274908c14..3ae11e897 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -11,10 +43,13 @@ #define t2 $r13 #define t3 $r14 #define t4 $r15 + +/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 +#define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 @@ -37,14 +72,13 @@ .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 0 * SIZE - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvld VX0, X, 0 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -54,70 +88,46 @@ .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX + ld.w t4, X, 0 xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 0 - xvinsgr2vr.w VX1, t2, 1 - xvinsgr2vr.w VX1, t3, 2 - xvinsgr2vr.w VX1, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX1, t1, 4 - xvinsgr2vr.w VX1, t2, 5 - xvinsgr2vr.w VX1, t3, 6 - xvinsgr2vr.w VX1, t4, 7 - xvfcvtl.d.s VX0, VX0 - xvfcvth.d.s VX1, VX1 - xvfmadd.d res1, VX0, VX0, res1 - xvfmadd.d res2, VX1, VX1, res2 + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: xvfadd.d res1, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + fadd.d $f19, $f19, $f16 + fadd.d $f19, $f19, $f17 + fadd.d $f19, $f19, $f18 .align 3 .L997: @@ -126,11 +136,11 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE - addi.d I, I, -1 + fld.s $f15, X, 0 + add.d X, X, INCX + addi.d I, I, -1 fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 - add.d X, X, INCX + fmadd.d $f19, $f15, $f15, $f19 blt $r0, I, .L998 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index 17d017900..bb492dbf0 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -1,3 +1,35 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -15,6 +47,9 @@ #define VX1 $vr16 #define VX2 $vr17 #define VX3 $vr18 +#define VX4 $vr21 +#define VX5 $vr22 +/* Don't change following FR unless you know the effects. */ #define res1 $vr19 #define res2 $vr20 @@ -24,99 +59,71 @@ LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 vxor.v res2, res2, res2 - bge $r0, N, .L999 + bge $r0, N, .L999 beq $r0, INCX, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 0 * SIZE - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - vld VX2, X, 4 * SIZE - vld VX3, X, 4 * SIZE - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vld VX0, X, 0 + vld VX5, X, 4 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 8 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfcvtl.d.s VX3, VX5 + vfcvth.d.s VX4, VX5 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 blt $r0, I, .L10 b .L996 .align 3 - .L20: bge $r0, I, .L997 .align 3 .L21: - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfcvtl.d.s VX0, VX0 - vfcvth.d.s VX1, VX1 - vfmadd.d res1, VX0, VX0, res1 - vfmadd.d res2, VX1, VX1, res2 - ld.w t1, X, 0 * SIZE + ld.w t2, X, 0 add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t3, X, 0 add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t4, X, 0 add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vfcvtl.d.s VX2, VX2 - vfcvth.d.s VX3, VX3 - vfmadd.d res1, VX2, VX2, res1 - vfmadd.d res2, VX3, VX3, res2 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 @@ -124,12 +131,8 @@ .L996: vfadd.d res1, res1, res2 - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 .align 3 .L997: @@ -138,7 +141,7 @@ .align 3 .L998: - fld.s $f15, X, 0 * SIZE + fld.s $f15, X, 0 addi.d I, I, -1 fcvt.d.s $f15, $f15 fmadd.d $f19, $f15, $f15, $f19 diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S deleted file mode 100644 index 7cf57bc77..000000000 --- a/kernel/loongarch64/ssum_lasx.S +++ /dev/null @@ -1,140 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define res1 $xr16 -#define res2 $xr17 - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S deleted file mode 100644 index de63c69e3..000000000 --- a/kernel/loongarch64/ssum_lsx.S +++ /dev/null @@ -1,125 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define res1 $vr16 -#define res2 $vr17 - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S deleted file mode 100644 index 7184eff45..000000000 --- a/kernel/loongarch64/sswap_lasx.S +++ /dev/null @@ -1,286 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvst VX2, X, 0 * SIZE - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 3 - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 4 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 5 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 6 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvstelm.w VX0, Y, 0, 7 - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvst VX2, X, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21: - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 3 - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - xvstelm.w VX2, X, 0, 4 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - xvstelm.w VX2, X, 0, 5 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - xvstelm.w VX2, X, 0, 6 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvstelm.w VX2, X, 0, 7 - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S deleted file mode 100644 index 4f19a8024..000000000 --- a/kernel/loongarch64/sswap_lsx.S +++ /dev/null @@ -1,294 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r17 -#define TEMP $r18 -#define XX $r5 -#define YY $r6 -#define t1 $r14 -#define t2 $r15 -#define t3 $r16 -#define t4 $r19 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 - - - PROLOGUE - bge $r0, N, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - addi.d I, I, -1 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - .align 3 - -.L112: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L113: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L113 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX0, Y, 0, 3 - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vst VX2, X, 0 * SIZE - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 0 - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 1 - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 2 - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vstelm.w VX1, Y, 0, 3 - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX2, X, 0, 0 - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - vstelm.w VX2, X, 0, 1 - add.d X, X, INCY - ld.w t3, X, 0 * SIZE - vstelm.w VX2, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX2, X, 0, 3 - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vst VX0, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - vstelm.w VX3, X, 0, 0 - add.d X, X, INCY - ld.w t2, X, 0 * SIZE - vstelm.w VX3, X, 0, 1 - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - vstelm.w VX3, X, 0, 2 - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vstelm.w VX3, X, 0, 3 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vst VX1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - .align 3 - move XX, X - -.L222: - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a1, X, 0 * SIZE - add.d X, X, INCX - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b1, Y, 0 * SIZE - fst.s a1, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a2, X, 0 * SIZE - add.d X, X, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b2, Y, 0 * SIZE - fst.s a2, Y, 0 * SIZE - add.d Y, Y, INCY - fld.s a3, X, 0 * SIZE - add.d X, X, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b3, Y, 0 * SIZE - fst.s a3, Y, 0 * SIZE - fld.s a4, X, 0 * SIZE - add.d X, X, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - fld.s b4, Y, 0 * SIZE - fst.s a4, Y, 0 * SIZE - add.d Y, Y, INCY - fst.s b1, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b2, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b3, XX, 0 * SIZE - add.d XX, XX, INCX - fst.s b4, XX, 0 * SIZE - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fst.s $f12, Y, 0 * SIZE - fst.s $f14, X, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S new file mode 100644 index 000000000..fd6d5adb3 --- /dev/null +++ b/kernel/loongarch64/sum_lasx.S @@ -0,0 +1,225 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + xvld VX0, X, 0 + xvfadd.s res1, res1, VX0 +#ifdef DOUBLE + xvld VX1, X, 32 + xvfadd.s res1, res1, VX1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S new file mode 100644 index 000000000..6b2027781 --- /dev/null +++ b/kernel/loongarch64/sum_lsx.S @@ -0,0 +1,204 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: + vld VX0, X, 0 + vld VX1, X, 16 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + VFADD res2, VX0, VX1 + VFADD res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, res1, VX1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 + add.d X, X, INCX + ld.d t2, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 + add.d X, X, INCX + ld.d t4, X, 0 + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S new file mode 100644 index 000000000..4767fffe3 --- /dev/null +++ b/kernel/loongarch64/swap_lasx.S @@ -0,0 +1,401 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and INCY==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 + xvld VX2, Y, 0 + addi.d I, I, -1 + xvst VX2, X, 0 + xvst VX0, Y, 0 +#ifdef DOUBLE + xvld VX0, X, 32 + xvld VX2, Y, 32 + xvst VX2, X, 32 + xvst VX0, Y, 32 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 + ld.d t1, Y, 0 + xvstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX0, Y, 0, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 + xvstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + xvstelm.d VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 + xvstelm.d VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.d t4, Y, 0 + xvstelm.d VX1, Y, 0, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX0, X, 0 + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 + xvstelm.w VX0, Y, 0, 4 + add.d Y, Y, INCY + ld.w t2, Y, 0 + xvstelm.w VX0, Y, 0, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 + xvstelm.w VX0, Y, 0, 6 + add.d Y, Y, INCY + ld.w t4, Y, 0 + xvstelm.w VX0, Y, 0, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 + ld.d t1, X, 0 + xvstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX2, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX2, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX2, X, 0, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 + xvstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + xvstelm.d VX3, X, 0, 1 + add.d X, X, INCX + ld.d t3, X, 0 + xvstelm.d VX3, X, 0, 2 + add.d X, X, INCX + ld.d t4, X, 0 + xvstelm.d VX3, X, 0, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 + xvstelm.w VX2, X, 0, 4 + add.d X, X, INCX + ld.w t2, X, 0 + xvstelm.w VX2, X, 0, 5 + add.d X, X, INCX + ld.w t3, X, 0 + xvstelm.w VX2, X, 0, 6 + add.d X, X, INCX + ld.w t4, X, 0 + xvstelm.w VX2, X, 0, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S new file mode 100644 index 000000000..736187f93 --- /dev/null +++ b/kernel/loongarch64/swap_lsx.S @@ -0,0 +1,431 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +/* INCX==1 and incy==1 */ +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 + vld VX1, X, 16 + vld VX2, Y, 0 + vld VX3, Y, 16 + addi.d I, I, -1 + vst VX2, X, 0 + vst VX3, X, 16 + vst VX0, Y, 0 + vst VX1, Y, 16 +#ifdef DOUBLE + vld VX0, X, 32 + vld VX1, X, 48 + vld VX2, Y, 32 + vld VX3, Y, 48 + vst VX2, X, 32 + vst VX3, X, 48 + vst VX0, Y, 32 + vst VX1, Y, 48 +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: +#ifdef DOUBLE + fld.d $f12, X, 0 + fld.d $f14, Y, 0 + addi.d I, I, -1 + fst.d $f12, Y, 0 + fst.d $f14, X, 0 +#else + fld.s $f12, X, 0 + fld.s $f14, Y, 0 + addi.d I, I, -1 + fst.s $f12, Y, 0 + fst.s $f14, X, 0 +#endif + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +/* INCX==1 and INCY!=1 */ +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 + vstelm.d VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.d t2, Y, 0 + vstelm.d VX0, Y, 0, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 + vstelm.d VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.d t4, Y, 0 + vstelm.d VX1, Y, 0, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 + ld.w t1, Y, 0 + vstelm.w VX0, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX0, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX0, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX0, Y, 0, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 + vstelm.w VX1, Y, 0, 0 + add.d Y, Y, INCY + ld.w t2, Y, 0 + vstelm.w VX1, Y, 0, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 + vstelm.w VX1, Y, 0, 2 + add.d Y, Y, INCY + ld.w t4, Y, 0 + vstelm.w VX1, Y, 0, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + addi.d X, X, SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +/* INCX!=1 and INCY==1 */ +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 + vstelm.d VX2, X, 0, 0 + add.d X, X, INCX + ld.d t2, X, 0 + vstelm.d VX2, X, 0, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 + vstelm.d VX3, X, 0, 0 + add.d X, X, INCX + ld.d t4, X, 0 + vstelm.d VX3, X, 0, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE +#else + vld VX2, Y, 0 + ld.w t1, X, 0 + vstelm.w VX2, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX2, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX2, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX2, X, 0, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 + vstelm.w VX3, X, 0, 0 + add.d X, X, INCX + ld.w t2, X, 0 + vstelm.w VX3, X, 0, 1 + add.d X, X, INCX + ld.w t3, X, 0 + vstelm.w VX3, X, 0, 2 + add.d X, X, INCX + ld.w t4, X, 0 + vstelm.w VX3, X, 0, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE + addi.d I, I, -1 + ST $f12, Y, 0 * SIZE + ST $f14, X, 0 * SIZE + add.d X, X, INCX + addi.d Y, Y, SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 + add.d X, X, INCX + LD a2, X, 0 + add.d X, X, INCX + LD a3, X, 0 + add.d X, X, INCX + LD a4, X, 0 + add.d X, X, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD b3, Y, 0 + ST a3, Y, 0 + add.d Y, Y, INCY + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + LD a1, X, 0 + add.d X, X, INCX + ST b1, XX, 0 + add.d XX, XX, INCX + LD b1, Y, 0 + ST a1, Y, 0 + add.d Y, Y, INCY + LD a2, X, 0 + add.d X, X, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + LD b2, Y, 0 + ST a2, Y, 0 + add.d Y, Y, INCY + LD a3, X, 0 + add.d X, X, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + LD b3, Y, 0 + ST a3, Y, 0 + LD a4, X, 0 + add.d X, X, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + LD b4, Y, 0 + ST a4, Y, 0 + add.d Y, Y, INCY + ST b1, XX, 0 + add.d XX, XX, INCX + ST b2, XX, 0 + add.d XX, XX, INCX + ST b3, XX, 0 + add.d XX, XX, INCX + ST b4, XX, 0 + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD $f12, X, 0 + LD $f14, Y, 0 + addi.d I, I, -1 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 154baad454647fdd6d71e2c907285859718da22e Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Wed, 27 Dec 2023 16:04:33 +0800 Subject: [PATCH 514/718] loongarch64: Refine iamin optimization. --- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{isamin_lasx.S => iamin_lasx.S} | 270 +++++++---- kernel/loongarch64/iamin_lsx.S | 446 ++++++++++++++++++ kernel/loongarch64/idamin_lasx.S | 275 ----------- kernel/loongarch64/idamin_lsx.S | 228 --------- kernel/loongarch64/isamin_lsx.S | 275 ----------- 8 files changed, 649 insertions(+), 863 deletions(-) rename kernel/loongarch64/{isamin_lasx.S => iamin_lasx.S} (54%) create mode 100644 kernel/loongarch64/iamin_lsx.S delete mode 100644 kernel/loongarch64/idamin_lasx.S delete mode 100644 kernel/loongarch64/idamin_lsx.S delete mode 100644 kernel/loongarch64/isamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 72e900f77..846fc0dbd 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -119,6 +119,7 @@ static inline int WhereAmI(void){ #define MOV fmov.d #define CMOVT fsel #define MTC movgr2fr.d +#define MTG movfr2gr.d #define FABS fabs.d #define FMIN fmin.d #define FMINA fmina.d @@ -136,6 +137,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.d #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d +#define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLT xvfcmp.clt.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -144,6 +147,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.d #define VFMAX vfmax.d #define VFMAXA vfmaxa.d +#define VCMPEQ vfcmp.ceq.d +#define VCMPLT vfcmp.clt.d #else @@ -159,6 +164,7 @@ static inline int WhereAmI(void){ #define MOV fmov.s #define CMOVT fsel #define MTC movgr2fr.w +#define MTG movfr2gr.s #define FABS fabs.s #define FMIN fmin.s #define FMINA fmina.s @@ -176,6 +182,8 @@ static inline int WhereAmI(void){ #define XVFMINA xvfmina.s #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s +#define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLT xvfcmp.clt.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -184,6 +192,8 @@ static inline int WhereAmI(void){ #define VFMINA vfmina.s #define VFMAX vfmax.s #define VFMAXA vfmaxa.s +#define VCMPEQ vfcmp.ceq.s +#define VCMPLT vfcmp.clt.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index cb230b348..4eae2e4f9 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S -ISAMINKERNEL = isamin_lsx.S -IDAMINKERNEL = idamin_lsx.S +ISAMINKERNEL = iamin_lsx.S +IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ba59c4566..e7e1b5d5a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -28,8 +28,8 @@ IDMINKERNEL = idmin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S -ISAMINKERNEL = isamin_lasx.S -IDAMINKERNEL = idamin_lasx.S +ISAMINKERNEL = iamin_lasx.S +IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/iamin_lasx.S similarity index 54% rename from kernel/loongarch64/isamin_lasx.S rename to kernel/loongarch64/iamin_lasx.S index cbdf32530..6ea117907 100644 --- a/kernel/loongarch64/isamin_lasx.S +++ b/kernel/loongarch64/iamin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,25 +128,49 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 +#else addi.d I, I, -1 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 +#endif + XVCMPEQ VT0, VM0, VM1 addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 @@ -105,28 +181,62 @@ xvpickve.w x4, VM0, 3 xvfmina.s VM1, x1, x2 xvfcmp.ceq.s VT0, x1, VM1 +#endif xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMINA VM0, x4, x3 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMINA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX xvinsgr2vr.w VM0, t1, 0 srai.d I, N, 3 @@ -186,9 +296,43 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmina.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmina.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +357,42 @@ xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 + xvadd.w VI2, VI1, VINC8 xvfmina.s VM1, VX0, VM0 xvfcmp.ceq.s VT0, VM1, VM0 +#endif addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 blt $r0, I, .L24 .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmina.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmina.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else fmov.s $f16, $f20 .align 3 @@ -306,35 +419,28 @@ xvfmina.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -346,9 +452,11 @@ xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 +#endif + +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 srai.d i1, N, 3 slli.d i1, i1, 3 addi.d i1, i1, 1 //current index @@ -357,17 +465,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmina.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX + XVFMINA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 + add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +483,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S new file mode 100644 index 000000000..ce885fd88 --- /dev/null +++ b/kernel/loongarch64/iamin_lsx.S @@ -0,0 +1,446 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#ifdef DOUBLE +#define VINC2 $vr17 +#define VINC4 $vr18 +#else +#define VINC4 $vr17 +#define VINC8 $vr18 +#endif +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 +#else + vadd.w VI1, VI1, VINC8 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v x2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 +#endif + VCMPEQ VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + + vreplvei.d VI1, VI0, 0 + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmina.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmina.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + addi.d I, I, -1 + vbitsel.v x2, x4, x2, VT0 + vfmina.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC4 + vfmina.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmina.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 +#endif + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + addi.d I, I, -1 + VFMINA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + add.d X, X, INCX + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d i1, i1, 1 + MTC $f21, i1 + blt $r0, I, .L22 + movfr2gr.s i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S deleted file mode 100644 index 6ef1e8903..000000000 --- a/kernel/loongarch64/idamin_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmina.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmina.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmina.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmina.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmina.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmina.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S deleted file mode 100644 index 9eb9d883f..000000000 --- a/kernel/loongarch64/idamin_lsx.S +++ /dev/null @@ -1,228 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmina.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmina.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - addi.d I, I, -1 - vbitsel.v x2, x4, x2, VT0 - vfmina.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmina.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - vfmina.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S deleted file mode 100644 index 598888660..000000000 --- a/kernel/loongarch64/isamin_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmina.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmina.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmina.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmina.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmina.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmina.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 8be26541933b36c6e3e8002c44002efb02033bdd Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 10:24:24 +0800 Subject: [PATCH 515/718] loongarch64: Refine imax optimization. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmax_lasx.S | 273 ----------- kernel/loongarch64/idmax_lsx.S | 225 --------- .../loongarch64/{ismax_lasx.S => imax_lasx.S} | 230 ++++++++-- kernel/loongarch64/imax_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismax_lsx.S | 272 ----------- 7 files changed, 626 insertions(+), 810 deletions(-) delete mode 100644 kernel/loongarch64/idmax_lasx.S delete mode 100644 kernel/loongarch64/idmax_lsx.S rename kernel/loongarch64/{ismax_lasx.S => imax_lasx.S} (57%) create mode 100644 kernel/loongarch64/imax_lsx.S delete mode 100644 kernel/loongarch64/ismax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 4eae2e4f9..346f1fb45 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lsx.S -IDMAXKERNEL = idmax_lsx.S +ISMAXKERNEL = imax_lsx.S +IDMAXKERNEL = imax_lsx.S ISMINKERNEL = ismin_lsx.S IDMINKERNEL = idmin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e7e1b5d5a..6b4df2d61 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -19,8 +19,8 @@ DMAXKERNEL = max_lsx.S SMINKERNEL = min_lsx.S DMINKERNEL = min_lsx.S -ISMAXKERNEL = ismax_lasx.S -IDMAXKERNEL = idmax_lasx.S +ISMAXKERNEL = imax_lasx.S +IDMAXKERNEL = imax_lasx.S ISMINKERNEL = ismin_lasx.S IDMINKERNEL = idmin_lasx.S diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S deleted file mode 100644 index bbfe0941a..000000000 --- a/kernel/loongarch64/idmax_lasx.S +++ /dev/null @@ -1,273 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S deleted file mode 100644 index 1b4734bab..000000000 --- a/kernel/loongarch64/idmax_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX0, VX1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x1, x3 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, VM0, x1 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x1, x2 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f15, $f9 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/imax_lasx.S similarity index 57% rename from kernel/loongarch64/ismax_lasx.S rename to kernel/loongarch64/imax_lasx.S index 843dd6c6a..2d3d5e9d3 100644 --- a/kernel/loongarch64/ismax_lasx.S +++ b/kernel/loongarch64/imax_lasx.S @@ -1,3 +1,29 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ #define ASSEMBLER #include "common.h" @@ -39,6 +65,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +127,47 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 xvfcmp.clt.s VT0, VM0, VX0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfcmp.clt.d VT0, x1, x2 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +181,33 @@ xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 xvfcmp.clt.s VT0, x1, x2 +#endif xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 - .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -143,11 +226,38 @@ ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX ld.w t2, TEMP, 0 * SIZE +#endif add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE + ld.d t3, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE + ld.d t4, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX +#ifdef DOUBLE + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else xvinsgr2vr.w VM0, t1, 4 xvinsgr2vr.w VM0, t2, 5 xvinsgr2vr.w VM0, t3, 6 @@ -186,9 +296,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + xvfcmp.clt.d VT0, VX0, VX1 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + xvfcmp.clt.d VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -218,10 +365,21 @@ addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -230,57 +388,56 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x1, x2 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 + XVCMPLT VT0, x3, x4 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 + XVCMPLT VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -343,6 +500,7 @@ fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 movfr2gr.s i0, $f20 +#endif .L21: //N<8 andi I, N, 7 @@ -357,14 +515,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 + CMPLT $fcc0, $f15, $f9 add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -372,4 +530,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S new file mode 100644 index 000000000..92556d4e6 --- /dev/null +++ b/kernel/loongarch64/imax_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x1, x3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, VM0, x1 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX0, VX1 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + vfcmp.clt.s VT0, x3, x4 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + vfcmp.clt.s VT0, VM0, VM1 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x1, x2 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f15, $f9 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S deleted file mode 100644 index 33b326bbd..000000000 --- a/kernel/loongarch64/ismax_lsx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX0, VX1 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x1, x2 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x3, x4 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM0, VM1 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.s $fcc0, $f15, $f9 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 116aee7527935a51b288a15b7feffc6ea2313e8a Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 15:17:28 +0800 Subject: [PATCH 516/718] loongarch64: Refine imin optimization. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/idmin_lasx.S | 272 ----------- kernel/loongarch64/idmin_lsx.S | 225 --------- .../loongarch64/{ismin_lasx.S => imin_lasx.S} | 266 ++++++++--- kernel/loongarch64/imin_lsx.S | 428 ++++++++++++++++++ kernel/loongarch64/ismin_lsx.S | 271 ----------- 7 files changed, 645 insertions(+), 825 deletions(-) delete mode 100644 kernel/loongarch64/idmin_lasx.S delete mode 100644 kernel/loongarch64/idmin_lsx.S rename kernel/loongarch64/{ismin_lasx.S => imin_lasx.S} (54%) create mode 100644 kernel/loongarch64/imin_lsx.S delete mode 100644 kernel/loongarch64/ismin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 346f1fb45..9164f28ef 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lsx.S IDMAXKERNEL = imax_lsx.S -ISMINKERNEL = ismin_lsx.S -IDMINKERNEL = idmin_lsx.S +ISMINKERNEL = imin_lsx.S +IDMINKERNEL = imin_lsx.S ISAMAXKERNEL = isamax_lsx.S IDAMAXKERNEL = idamax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 6b4df2d61..5882b8932 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -22,8 +22,8 @@ DMINKERNEL = min_lsx.S ISMAXKERNEL = imax_lasx.S IDMAXKERNEL = imax_lasx.S -ISMINKERNEL = ismin_lasx.S -IDMINKERNEL = idmin_lasx.S +ISMINKERNEL = imin_lasx.S +IDMINKERNEL = imin_lasx.S ISAMAXKERNEL = isamax_lasx.S IDAMAXKERNEL = idamax_lasx.S diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S deleted file mode 100644 index 7930d4963..000000000 --- a/kernel/loongarch64/idmin_lasx.S +++ /dev/null @@ -1,272 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - xvbitsel.v VM1, VX0, VX1, VT0 - xvbitsel.v VI2, VI1, VI2, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfcmp.clt.d VT0, x2, x1 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.d VT0, x4, x3 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.d VT0, VM1, VM0 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S deleted file mode 100644 index 8b6edcbf0..000000000 --- a/kernel/loongarch64/idmin_lsx.S +++ /dev/null @@ -1,225 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - addi.d X, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x1, VX0, VX1, VT0 - vbitsel.v x2, VI1, VI2, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfcmp.clt.d VT0, VX1, VX0 - vbitsel.v x3, VX0, VX1, VT0 - vbitsel.v x4, VI1, VI2, VT0 - vfcmp.clt.d VT0, x3, x1 - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT0 - vfcmp.clt.d VT0, x1, VM0 - addi.d I, I, -1 - vbitsel.v VM0, VM0, x1, VT0 - vbitsel.v VI0, VI0, x2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfcmp.clt.d VT0, x2, x1 - vbitsel.v VM0, x1, x2, VT0 - vbitsel.v VI0, VI1, VI2, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - fcmp.clt.d $fcc0, $f9, $f15 - add.d X, X, INCX - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/imin_lasx.S similarity index 54% rename from kernel/loongarch64/ismin_lasx.S rename to kernel/loongarch64/imin_lasx.S index 15f6e2ec9..5306828e2 100644 --- a/kernel/loongarch64/ismin_lasx.S +++ b/kernel/loongarch64/imin_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,20 +128,45 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -98,32 +175,67 @@ xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -185,9 +297,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI1, VI1, VINC8 + xvadd.d VI2, VI1, VINC4 + XVCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + xvbitsel.v VM1, VX0, VX1, VT0 + xvbitsel.v VI2, VI1, VI2, VT0 + XVCMPLT VT0, VM1, VM0 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VI0, VI2, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -213,73 +362,83 @@ xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 xvadd.w VI1, VI1, VINC8 - xvfcmp.clt.s VT0, VX0, VM0 + XVCMPLT VT0, VX0, VM0 addi.d I, I, -1 xvbitsel.v VM0, VM0, VX0, VT0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 + xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 + xvor.v VX1, VM0, VX1 xvpickve.w VI1, VI0, 0 xvpickve.w VI2, VI0, 1 xvpickve.w VI3, VI0, 2 xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x2, x1 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 +#endif + XVCMPLT VT0, x2, x1 xvbitsel.v VM1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM1, VM0 + XVCMPLT VT0, VM1, VM0 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + MTG i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifndef DOUBLE .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -294,13 +453,13 @@ xvpickve.w x2, VM0, 5 xvpickve.w x3, VM0, 6 xvpickve.w x4, VM0, 7 - xvfcmp.clt.s VT0, x2, x1 + XVCMPLT VT0, x2, x1 xvbitsel.v x1, x1, x2, VT0 xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x4, x3 + XVCMPLT VT0, x4, x3 xvbitsel.v VM0, x3, x4, VT0 xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, x1, VM0 + XVCMPLT VT0, x1, VM0 xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 li.d TEMP, 1 //处理尾数相等时取最小序号 @@ -309,7 +468,7 @@ xvfcmp.ceq.s VT0, VM0, x1 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 @@ -317,7 +476,7 @@ xvfcmp.ceq.s VT0, VM0, x2 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 @@ -325,7 +484,7 @@ xvfcmp.ceq.s VT0, VM0, x3 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 @@ -333,17 +492,18 @@ xvfcmp.ceq.s VT0, VM0, x4 fcmp.ceq.s $fcc0, $f23, $f17 bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L292: - fcmp.clt.s $fcc0, $f13, $f15 + CMPLT $fcc0, $f13, $f15 fsel $f15, $f15, $f13, $fcc0 fsel $f20, $f20, $f16, $fcc0 - movfr2gr.s i0, $f20 + MTG i0, $f20 +#endif -.L21: //N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -356,14 +516,14 @@ .L22: fld.d $f9, X, 0 addi.d I, I, -1 - fcmp.clt.s $fcc0, $f9, $f15 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX fsel $f15, $f15, $f9, $fcc0 fsel $f20, $f20, $f21, $fcc0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 - add.d X, X, INCX blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -371,4 +531,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S new file mode 100644 index 000000000..a0c411e7a --- /dev/null +++ b/kernel/loongarch64/imin_lsx.S @@ -0,0 +1,428 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, $vr18 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, $vr17 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + addi.d X, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + vadd.w VI1, VI1, $vr18 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 + move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d $vr17, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d $vr18, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w $vr17, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w $vr18, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, $vr18 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, $vr17 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + VCMPLT VT0, x3, x1 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT0 + VCMPLT VT0, x1, VM0 + addi.d I, I, -1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, $vr18 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, $vr17 + VCMPLT VT0, VX1, VX0 + addi.d I, I, -1 + vbitsel.v VM1, VX0, VX1, VT0 + vbitsel.v VI2, VI1, VI2, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + VCMPLT VT0, x2, x1 + vbitsel.v VM1, x1, x2, VT0 + vbitsel.v $vr17, VI1, VI2, VT0 + VCMPLT VT0, x4, x3 + vbitsel.v VM0, x3, x4, VT0 + vbitsel.v $vr18, VI3, VI4, VT0 + VCMPLT VT0, VM1, VM0 + vbitsel.v VM0, VM0, VM1, VT0 + vbitsel.v VI0, $vr18, $vr17, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + VCMPLT VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + VCMPLT VT0, x2, x1 + vbitsel.v VM0, x1, x2, VT0 + vbitsel.v VI0, VI1, VI2, VT0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + VCMPLT VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 +#endif + .align 3 + +.L27: +#ifndef DOUBLE + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + VCMPLT VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + VCMPLT VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#endif + MTG i0, $f20 + .align 3 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + fld.d $f9, X, 0 + addi.d I, I, -1 + CMPLT $fcc0, $f9, $f15 + add.d X, X, INCX + fsel $f15, $f15, $f9, $fcc0 + fsel $f20, $f20, $f21, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S deleted file mode 100644 index f90ebbd57..000000000 --- a/kernel/loongarch64/ismin_lsx.S +++ /dev/null @@ -1,271 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfcmp.clt.s VT0, VX1, VX0 - addi.d I, I, -1 - vbitsel.v VM1, VX0, VX1, VT0 - vbitsel.v VI2, VI1, VI2, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VI0, VI2, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfcmp.clt.s VT0, x2, x1 - vbitsel.v VM1, x1, x2, VT0 - vbitsel.v VINC4, VI1, VI2, VT0 - vfcmp.clt.s VT0, x4, x3 - vbitsel.v VM0, x3, x4, VT0 - vbitsel.v VINC8, VI3, VI4, VT0 - vfcmp.clt.s VT0, VM1, VM0 - vbitsel.v VM0, VM0, VM1, VT0 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, x1, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, x2, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, x3, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, x4, VM0 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - fcmp.clt.s $fcc0, $f9, $f15 - fsel $f15, $f15, $f9, $fcc0 - fsel $f20, $f20, $f21, $fcc0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From ea70e165c71201e46961c479e53c17d4034290f8 Mon Sep 17 00:00:00 2001 From: zhoupeng Date: Thu, 28 Dec 2023 20:07:59 +0800 Subject: [PATCH 517/718] loongarch64: Refine rot optimization. --- common_loongarch64.h | 10 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- kernel/loongarch64/drot_lsx.S | 1050 ---------- .../loongarch64/{drot_lasx.S => rot_lasx.S} | 779 ++++++- kernel/loongarch64/rot_lsx.S | 1791 +++++++++++++++++ kernel/loongarch64/srot_lasx.S | 863 -------- kernel/loongarch64/srot_lsx.S | 927 --------- 8 files changed, 2532 insertions(+), 2896 deletions(-) delete mode 100644 kernel/loongarch64/drot_lsx.S rename kernel/loongarch64/{drot_lasx.S => rot_lasx.S} (52%) create mode 100644 kernel/loongarch64/rot_lsx.S delete mode 100644 kernel/loongarch64/srot_lasx.S delete mode 100644 kernel/loongarch64/srot_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 846fc0dbd..13514d6e0 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -129,6 +129,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d +#define FFINT ffint.d.l #define XVFSUB xvfsub.d #define XVFADD xvfadd.d @@ -139,6 +140,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d #define XVCMPLT xvfcmp.clt.d +#define XVMUL xvfmul.d +#define XVMSUB xvfmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -149,6 +152,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d #define VCMPLT vfcmp.clt.d +#define VMUL vfmul.d +#define VMSUB vfmsub.d #else @@ -174,6 +179,7 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s +#define FFINT ffint.s.l #define XVFSUB xvfsub.s #define XVFADD xvfadd.s @@ -184,6 +190,8 @@ static inline int WhereAmI(void){ #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s #define XVCMPLT xvfcmp.clt.s +#define XVMUL xvfmul.s +#define XVMSUB xvfmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -194,6 +202,8 @@ static inline int WhereAmI(void){ #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s #define VCMPLT vfcmp.clt.s +#define VMUL vfmul.s +#define VMSUB vfmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 9164f28ef..b315c81f2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lsx.S SASUMKERNEL = sasum_lsx.S DASUMKERNEL = dasum_lsx.S -SROTKERNEL = srot_lsx.S -DROTKERNEL = drot_lsx.S +SROTKERNEL = rot_lsx.S +DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 5882b8932..577f6316e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,8 @@ DSUMKERNEL = sum_lasx.S SASUMKERNEL = sasum_lasx.S DASUMKERNEL = dasum_lasx.S -SROTKERNEL = srot_lasx.S -DROTKERNEL = drot_lasx.S +SROTKERNEL = rot_lasx.S +DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S deleted file mode 100644 index 6db803b1c..000000000 --- a/kernel/loongarch64/drot_lsx.S +++ /dev/null @@ -1,1050 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.d.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, C - vreplgr2vr.d VXC, t1 - movfr2gr.d t2, S - vreplgr2vr.d VXS, t2 - movfr2gr.d t3, a1 - vreplgr2vr.d VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 2 * SIZE - vst VXZ, Y, 2 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - vst VXZ, X, 6 * SIZE - vst VXZ, Y, 6 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VXS, VX0 - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX1, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXC, VX0 - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VXS, VX2 - vfmul.d VT1, VXS, VX0 - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX1, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.d $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.d $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmadd.d VT0, VX2, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmadd.d VT0, VX3, VXS, VT0 - vfmul.d VT1, VX0, VXS - vfmsub.d VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX0, VXC - vfmul.d VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX1, VXC - vfmul.d VT1, VX3, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX2, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - vfmul.d VT0, VX3, VXS - vfmul.d VT1, VX0, VXS - vfsub.d VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/rot_lasx.S similarity index 52% rename from kernel/loongarch64/drot_lasx.S rename to kernel/loongarch64/rot_lasx.S index d3644b780..5d7e3d7cc 100644 --- a/kernel/loongarch64/drot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -33,16 +60,25 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 movfr2gr.d t2, S xvreplgr2vr.d VXS, t2 movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + xvreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + xvreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -53,15 +89,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L112 // C!=0 S==0 b .L111 // C!=0 S!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L114 // C==0 S==0 b .L113 // C==0 S!=0 .align 3 @@ -69,20 +105,24 @@ .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -93,16 +133,20 @@ .L112: // C!=0 S==0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC +#endif + XVMUL VT0, VX0, VXC + XVMUL VT1, VX2, VXC xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC +#ifdef DOUBLE + XVMUL VT0, VX1, VXC + XVMUL VT1, VX3, VXC xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,18 +157,22 @@ .L113: // C==0 S!=0 xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 +#endif + XVMUL VT0, VX2, VXS + XVMUL VT1, VX0, VXS + XVFSUB VT1, VXZ, VT1 xvst VT0, X, 0 * SIZE xvst VT1, Y, 0 * SIZE - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS +#ifdef DOUBLE + XVMUL VT0, VX3, VXS + XVMUL VT1, VX1, VXS xvfsub.d VT1, VXZ, VT1 xvst VT0, X, 4 * SIZE xvst VT1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -135,8 +183,10 @@ .L114: // C==0 S==0 xvst VXZ, X, 0 * SIZE xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, X, 4 * SIZE xvst VXZ, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -148,37 +198,66 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L122 // C!=0 S==0 b .L121 // C!=0 S!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L124 // C==0 S==0 b .L123 // C==0 S!=0 .align 3 .L121: // C!=0 S!=0 xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY +#endif + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvst VT0, X, 0 * SIZE xvstelm.d VT1, YY, 0, 0 @@ -201,11 +280,10 @@ xvinsgr2vr.d VX3, t3, 2 xvinsgr2vr.d VX3, t4, 3 add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 - addi.d I, I, -1 + XVMUL VT0, VX1, VXC + XVFMADD VT0, VX3, VXS, VT0 + XVMUL VT1, VX1, VXS + XVMSUB VT1, VX3, VXC, VT1 xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -214,13 +292,34 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // C!=0 S==0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -269,13 +368,60 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // C==0 S!=0 +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -326,14 +472,63 @@ xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvst VT0, X, 0 * SIZE + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // C==0 S==0 xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 0 * SIZE xvst VXZ, X, 4 * SIZE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY @@ -350,29 +545,50 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvst VXZ, X, 0 * SIZE + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 + addi.d X, X, 8 * SIZE blt $r0, I, .L124 + move Y, YY b .L997 .align 3 .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L212 // C!=0 S==0 b .L211 // C!=0 S!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L214 // C==0 S==0 b .L213 // C==0 S!=0 .align 3 .L211: // C!=0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -425,6 +641,54 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ -432,6 +696,7 @@ .align 3 .L212: // C!=0 S==0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -480,6 +745,52 @@ add.d XX, XX, INCX xvfmul.d VT1, VX3, VXS xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXC, VX0 + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L212 @@ -487,6 +798,7 @@ .align 3 .L213: // C==0 S!=0 +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -537,6 +849,53 @@ xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VT0, VXS, VX2 + xvfmul.s VT1, VXS, VX0 + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L213 @@ -544,6 +903,7 @@ .align 3 .L214: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -562,6 +922,25 @@ xvstelm.d VXZ, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 4 * SIZE +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 0 * SIZE + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L211 @@ -572,20 +951,21 @@ bge $r0, I, .L997 move YY, Y move XX, X - fcmp.ceq.d $fcc0, C, a1 + CMPEQ $fcc0, C, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L222 // C!=0 S==0 b .L221 // C!=0 S!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, S, a1 + CMPEQ $fcc0, S, a1 bcnez $fcc0, .L224 // C==0 S==0 b .L223 // C==0 S!=0 .align 3 .L221: // C!=0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -674,12 +1054,99 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmadd.s VT0, VX2, VXS, VT0 + xvfmul.s VT1, VX0, VXS + xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L221 b .L997 .align 3 .L222: // C!=0 S==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -764,12 +1231,97 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX0, VXC + xvfmul.s VT1, VX2, VXC + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L222 b .L997 .align 3 .L223: // C==0 S!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -856,12 +1408,98 @@ add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VT0, VX2, VXS + xvfmul.s VT1, VX0, VXS + xvfsub.s VT1, VXZ, VT1 + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VT1, YY, 0, 7 + add.d YY, YY, INCY +#endif addi.d I, I, -1 blt $r0, I, .L223 b .L997 .align 3 .L224: // C==0 S==0 +#ifdef DOUBLE xvstelm.d VXZ, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VXZ, XX, 0, 1 @@ -893,9 +1531,46 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 7 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif b .L997 .align 3 @@ -905,15 +1580,15 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE - fmul.d $f10, $f12, C - fmadd.d $f10, $f13, S, $f10 - fst.d $f10, X, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE addi.d I, I, -1 - fmul.d $f20, $f12, S - fmsub.d $f20, $f13, C, $f20 - fst.d $f20, Y, 0 * SIZE + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 @@ -924,4 +1599,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S new file mode 100644 index 000000000..4b0e59310 --- /dev/null +++ b/kernel/loongarch64/rot_lsx.S @@ -0,0 +1,1791 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr19 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, C + vreplgr2vr.d VXC, t1 + movfr2gr.d t2, S + vreplgr2vr.d VXS, t2 + movfr2gr.d t3, a1 + vreplgr2vr.d VXZ, t3 +#else + movfr2gr.s t1, C + vreplgr2vr.w VXC, t1 + movfr2gr.s t2, S + vreplgr2vr.w VXS, t2 + movfr2gr.s t3, a1 + vreplgr2vr.w VXZ, t3 +#endif + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE +#endif + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE + vst VT1, Y, 0 * SIZE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 +#ifdef DOUBLE + vst VT0, X, 2 * SIZE + vst VT1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 +#endif + vst VT0, X, 4 * SIZE + vst VT1, Y, 4 * SIZE +#ifdef DOUBLE + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vst VT1, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE +#endif + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE +#ifdef DOUBLE + vst VXZ, X, 6 * SIZE + vst VXZ, Y, 6 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L120 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L122 // C!=0 S==0 + b .L121 // C!=0 S!=0 + .align 3 + +.L120: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L124 // C==0 S==0 + b .L123 // C==0 S!=0 + .align 3 + +.L121: // C!=0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE +#endif + add.d Y, Y, INCY +#ifndef DOUBLE + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY +#endif + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + b .L997 + .align 3 + +.L122: // C!=0 S==0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L122 + b .L997 + .align 3 + +.L123: // C==0 S!=0 +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 +#else + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 +#endif + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 0 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX0, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 2 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vst VT0, X, 4 * SIZE + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L123 + b .L997 + .align 3 + +.L124: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L124 +#ifdef DOUBLE + move Y, YY +#endif + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L210 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L212 // C!=0 S==0 + b .L211 // C!=0 S!=0 + .align 3 + +.L210: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L214 // C==0 S==0 + b .L213 // C==0 S!=0 + .align 3 + +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + b .L997 + .align 3 + +.L212: // C!=0 S==0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXS + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L212 + b .L997 + .align 3 + +.L213: // C==0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + +#ifdef DOUBLE + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + add.d X, X, INCX + VMUL VT0, VXS, VX2 + VMUL VT1, VXS, VX0 + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 6 * SIZE +#else + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + VMUL VT0, VX3, VXS + VMUL VT1, VX1, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L213 + b .L997 + .align 3 + +.L214: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vst VT1, Y, 0 * SIZE + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 +#endif + add.d XX, XX, INCX + vst VT1, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 +#ifdef DOUBLE + move X, XX +#endif + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX3, VXC, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY +#endif + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#ifndef DOUBLE + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX0, VXC + VMUL VT1, VX2, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VMUL VT1, VX3, VXC + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX +#endif +#ifdef DOUBLE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX2, t3, 0 + vinsgr2vr.d VX2, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE + vinsgr2vr.d VX3, t1, 0 + vinsgr2vr.d VX3, t2, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX2, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + VMUL VT0, VX3, VXS + VMUL VT1, VX0, VXS + VFSUB VT1, VXZ, VT1 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VT1, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 3 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 +#ifdef DOUBLE + move X, XX + move Y, YY +#endif + b .L997 + .align 3 + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE + MUL $f10, $f12, C + MADD $f10, $f13, S, $f10 + ST $f10, X, 0 * SIZE + addi.d I, I, -1 + MUL $f20, $f12, S + MSUB $f20, $f13, C, $f20 + ST $f20, Y, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S deleted file mode 100644 index 9aeb4dcf5..000000000 --- a/kernel/loongarch64/srot_lasx.S +++ /dev/null @@ -1,863 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VT0 $xr10 -#define VT1 $xr18 -#define VXC $xr23 -#define VXS $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - xvreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - xvreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXS, VX2 - xvfmul.s VT1, VXS, VX0 - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S deleted file mode 100644 index 8822b58e4..000000000 --- a/kernel/loongarch64/srot_lsx.S +++ /dev/null @@ -1,927 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define Y $r7 -#define INCY $r8 -#define C $f0 -#define S $f1 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VT0 $vr10 -#define VT1 $vr18 -#define VXC $vr23 -#define VXS $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, C - vreplgr2vr.w VXC, t1 - movfr2gr.s t2, S - vreplgr2vr.w VXS, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 - -.L111: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - -.L121: // C!=0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VXS, VX0 - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX1, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXC, VX0 - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXS - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VXS, VX2 - vfmul.s VT1, VXS, VX0 - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX1, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - fcmp.ceq.s $fcc0, C, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmadd.s VT0, VX2, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmadd.s VT0, VX3, VXS, VT0 - vfmul.s VT1, VX0, VXS - vfmsub.s VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // C!=0 S==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX0, VXC - vfmul.s VT1, VX2, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX1, VXC - vfmul.s VT1, VX3, VXC - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX2, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VT0, VX3, VXS - vfmul.s VT1, VX0, VXS - vfsub.s VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - fmul.s $f10, $f12, C - fmadd.s $f10, $f13, S, $f10 - fst.s $f10, X, 0 * SIZE - addi.d I, I, -1 - fmul.s $f20, $f12, S - fmsub.s $f20, $f13, C, $f20 - fst.s $f20, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 173a65d4e6ddf8bc5a9b1cd02d594d4b48dc9f89 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 25 Dec 2023 15:11:04 +0800 Subject: [PATCH 518/718] loongarch64: Add and refine iamax optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 7 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 7 +- kernel/loongarch64/camax_lasx.S | 194 ++++++ kernel/loongarch64/camax_lsx.S | 206 +++++++ .../{isamax_lasx.S => iamax_lasx.S} | 254 ++++++-- kernel/loongarch64/iamax_lsx.S | 482 +++++++++++++++ kernel/loongarch64/icamax_lasx.S | 562 ++++++++++++++++++ kernel/loongarch64/icamax_lsx.S | 434 ++++++++++++++ kernel/loongarch64/idamax_lasx.S | 275 --------- kernel/loongarch64/idamax_lsx.S | 267 --------- kernel/loongarch64/isamax_lsx.S | 275 --------- 12 files changed, 2101 insertions(+), 866 deletions(-) create mode 100644 kernel/loongarch64/camax_lasx.S create mode 100644 kernel/loongarch64/camax_lsx.S rename kernel/loongarch64/{isamax_lasx.S => iamax_lasx.S} (55%) create mode 100644 kernel/loongarch64/iamax_lsx.S create mode 100644 kernel/loongarch64/icamax_lasx.S create mode 100644 kernel/loongarch64/icamax_lsx.S delete mode 100644 kernel/loongarch64/idamax_lasx.S delete mode 100644 kernel/loongarch64/idamax_lsx.S delete mode 100644 kernel/loongarch64/isamax_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 13514d6e0..599b4795c 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -139,6 +139,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.d #define XVFMAXA xvfmaxa.d #define XVCMPEQ xvfcmp.ceq.d +#define XVCMPLE xvfcmp.cle.d #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d @@ -151,6 +152,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.d #define VFMAXA vfmaxa.d #define VCMPEQ vfcmp.ceq.d +#define VCMPLE vfcmp.cle.d #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d @@ -189,6 +191,7 @@ static inline int WhereAmI(void){ #define XVFMAX xvfmax.s #define XVFMAXA xvfmaxa.s #define XVCMPEQ xvfcmp.ceq.s +#define XVCMPLE xvfcmp.cle.s #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s @@ -201,6 +204,7 @@ static inline int WhereAmI(void){ #define VFMAX vfmax.s #define VFMAXA vfmaxa.s #define VCMPEQ vfcmp.ceq.s +#define VCMPLE vfcmp.cle.s #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index b315c81f2..a8a6dd82f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S +CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lsx.S ISMINKERNEL = imin_lsx.S IDMINKERNEL = imin_lsx.S -ISAMAXKERNEL = isamax_lsx.S -IDAMAXKERNEL = idamax_lsx.S +ISAMAXKERNEL = iamax_lsx.S +IDAMAXKERNEL = iamax_lsx.S +ICAMAXKERNEL = icamax_lsx.S +IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 577f6316e..e4c45e1fa 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -9,6 +9,7 @@ DSCALKERNEL = dscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S +CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S @@ -25,8 +26,10 @@ IDMAXKERNEL = imax_lasx.S ISMINKERNEL = imin_lasx.S IDMINKERNEL = imin_lasx.S -ISAMAXKERNEL = isamax_lasx.S -IDAMAXKERNEL = idamax_lasx.S +ISAMAXKERNEL = iamax_lasx.S +IDAMAXKERNEL = iamax_lasx.S +ICAMAXKERNEL = icamax_lasx.S +IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S new file mode 100644 index 000000000..7013430cb --- /dev/null +++ b/kernel/loongarch64/camax_lasx.S @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + xvxor.v VM0, VM0, VM0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmax.s VM1, x1, x2 + xvfmax.s VM0, x3, x4 + xvfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S new file mode 100644 index 000000000..2e55629de --- /dev/null +++ b/kernel/loongarch64/camax_lsx.S @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + vxor.v VM0, VM0, VM0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmax.s VM1, x1, VM1 + vfmax.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmax.s VM1, x1, x2 + vfmax.s VM0, x3, x4 + vfmax.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmax.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmax.s s1, s1, s2 + fmax.s s3, s3, s4 + fmax.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmax.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/iamax_lasx.S similarity index 55% rename from kernel/loongarch64/isamax_lasx.S rename to kernel/loongarch64/iamax_lasx.S index 2800b1d43..090da3004 100644 --- a/kernel/loongarch64/isamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -1,3 +1,30 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + #define ASSEMBLER #include "common.h" @@ -39,6 +66,31 @@ slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 srai.d I, N, 3 bge $r0, I, .L21 @@ -76,9 +128,25 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L10: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvadd.d VI1, VI1, VINC8 + xvld VX1, X, 4 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else xvld VX0, X, 0 * SIZE addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 @@ -87,10 +155,21 @@ addi.d X, X, 8 * SIZE xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L10 .align 3 .L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -103,28 +182,62 @@ xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 +#endif + XVFMAXA VM1, x1, x2 + XVCMPEQ VT0, x1, VM1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 + XVFMAXA VM0, x3, x4 + XVCMPEQ VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM0, VM0, VM1 + XVCMPEQ VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 .align 3 .L20: // INCX!=1 move TEMP, X +#ifdef DOUBLE + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.d t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + xvinsgr2vr.d VM0, t2, 1 + xvinsgr2vr.d VM0, t3, 2 + xvinsgr2vr.d VM0, t4, 3 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + slli.d i0, i0, 1 //8 + xvreplgr2vr.d VINC8, i0 + addi.d i0, i0, -15 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 5 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 //2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 //3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 //4 +#else addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -186,9 +299,46 @@ xvinsgr2vr.w VI0, i0, 6 //7 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 7 //8 +#endif .align 3 .L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvadd.d VI2, VI1, VINC4 + xvfmaxa.d VM1, VX0, VX1 + xvfcmp.ceq.d VT0, VX0, VM1 + addi.d I, I, -1 + xvbitsel.v VI2, VI2, VI1, VT0 + xvfmaxa.d VM1, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VM0, VM1, VM0, VT0 + xvbitsel.v VI0, VI2, VI0, VT0 +#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -219,10 +369,30 @@ addi.d I, I, -1 xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 +#endif blt $r0, I, .L24 .align 3 .L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, x1, VM1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x4, x3 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else xvxor.v VX0, VX0, VX0 xvor.v VX0, VI0, VX0 xvxor.v VX1, VX1, VX1 @@ -245,44 +415,45 @@ xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VM0, VM0, VM1, VT0 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 +#endif + CMPEQ $fcc0, $f15, $f9 bceqz $fcc0, .L26 - xvfcmp.clt.s VT0, VI1, VI0 + XVCMPLT VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L27 - xvfcmp.clt.s VT0, VI2, VI0 + XVCMPLT VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 .L27: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L28 - xvfcmp.clt.s VT0, VI3, VI0 + XVCMPLT VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L28: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L29 - xvfcmp.clt.s VT0, VI4, VI0 + XVCMPLT VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 .L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else fmov.s $f16, $f20 +#endif .align 3 +#ifdef DOUBLE + +#else .L252: xvxor.v VI0, VI0, VI0 xvor.v VI0, VI0, VX0 @@ -306,35 +477,27 @@ xvfmaxa.s VM0, VM0, VM1 xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - xvfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L262 xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 .align 3 .L262: - xvfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f10 bceqz $fcc0, .L272 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 - .L272: - xvfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f11 bceqz $fcc0, .L282 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 .L282: - xvfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 + fcmp.ceq.d $fcc0, $f15, $f12 bceqz $fcc0, .L292 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 @@ -345,8 +508,9 @@ xvfcmp.ceq.s VT0, VM0, VX0 xvbitsel.v VI0, VI0, VI1, VT0 movfr2gr.s i0, $f20 +#endif -.L21: // N<8 +.L21: //N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -357,17 +521,17 @@ .align 3 .L22: - fld.s $f9, X, 0 + LD $f9, X, 0 addi.d I, I, -1 - xvfmaxa.s VM1, x1, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + XVFMAXA VM1, x1, VM0 + XVCMPEQ VT0, VM0, VM1 add.d X, X, INCX xvbitsel.v VM0, VM1, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - movfr2gr.s i0, $f20 + MTG i0, $f20 .align 3 .L999: @@ -375,4 +539,4 @@ jirl $r0, $r1, 0x0 .align 3 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S new file mode 100644 index 000000000..ce5b3c724 --- /dev/null +++ b/kernel/loongarch64/iamax_lsx.S @@ -0,0 +1,482 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC2 $vr17 +#define VINC4 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + bne INCX, TEMP, .L20 + vld VM0, X, 0 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI2, VINC2 + vld VX1, X, 6 * SIZE + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + addi.d I, I, -1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 +#endif + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L16 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L17 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +#ifdef DOUBLE +.L16: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + vfmaxa.d VM1, x1, VM0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: // INCX!=1 + move TEMP, X + addi.d i0, i0, 1 + ld.d t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.d t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.d VM0, t2, 1 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC2, i0 + slli.d i0, i0, 1 //4 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 3 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x1, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x1 + vbitsel.v x2, VI2, VI1, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t2, 1 + vadd.d VI1, VI2, VINC2 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t2, 1 + vadd.d VI2, VI1, VINC2 + vfmaxa.d x3, VX0, VX1 + vfcmp.ceq.d VT0, VX0, x3 + vbitsel.v x4, VI2, VI1, VT0 + vfmaxa.d x3, x1, x3 + vfcmp.ceq.d VT0, x1, x3 + vbitsel.v x2, x4, x2, VT0 + vfmaxa.d VM1, VM0, x3 + vbitsel.v VM0, VM1, VM0, VT0 + vfcmp.ceq.d VT0, VM0, VM1 + vbitsel.v VI0, x2, VI0, VT0 + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + +#else +.L20: // INCX!=1 + move TEMP, X + addi.w i0, i0, 1 + ld.w t1, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t1, 0 + srai.d I, N, 3 + bge $r0, I, .L21 + ld.w t2, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t3, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + ld.w t4, TEMP, 0 * SIZE + add.d TEMP, TEMP, INCX + vinsgr2vr.w VM0, t2, 1 + vinsgr2vr.w VM0, t3, 2 + vinsgr2vr.w VM0, t4, 3 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC2, i0 + slli.w i0, i0, 1 //8 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -15 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 5 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 + .align 3 + +.L24: + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vadd.w VI2, VI1, VINC2 + vfmaxa.s VM1, VX0, VX1 + vfcmp.ceq.s VT0, VX0, VM1 + vbitsel.v VI2, VI2, VI1, VT0 + vfmaxa.s VM1, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + addi.d I, I, -1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI2, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC2, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC4, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC4, VINC2, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: // N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD $f9, X, 0 + VFMAXA VM1, x1, VM0 + VCMPEQ VT0, VM0, VM1 + vbitsel.v VM0, VM1, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + addi.d I, I, -1 + addi.d i1, i1, 1 + add.d X, X, INCX + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S new file mode 100644 index 000000000..7800cb917 --- /dev/null +++ b/kernel/loongarch64/icamax_lasx.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + xvxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmax.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmax.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmax.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d I, I, -1 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.s x1, x1, x2 + xvfmax.s x3, VM0, x1 + xvfcmp.ceq.s VT0, x3, VM0 +#endif + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmaxa.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmaxa.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmaxa.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 +#else + fmov.s $f16, $f20 +#endif + .align 3 + +#ifdef DOUBLE +#else +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +#endif +.L21: //N<8 +#ifdef DOUBLE + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S new file mode 100644 index 000000000..a2fc9dbbd --- /dev/null +++ b/kernel/loongarch64/icamax_lsx.S @@ -0,0 +1,434 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + vxor.v VM0, VM0, VM0 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 +#endif + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMAX x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + addi.d I, I, -1 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmax.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VI4, x1 + vfmul.s x4, VI4, x2 + vfcmp.clt.s VT0, x1, VI3 + vfcmp.clt.s VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.s x1, x1, x2 + vfmax.s x3, VM0, x1 + vfcmp.ceq.s VT0, x3, VM0 +#endif + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmaxa.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmaxa.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmaxa.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +#ifdef DOUBLE +.L26: + vfmaxa.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 + vbitsel.v VI0, VI2, VI1, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 +#else +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMAX a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S deleted file mode 100644 index 8248ee757..000000000 --- a/kernel/loongarch64/idamax_lasx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $xr9 -#define x2 $xr10 -#define x3 $xr11 -#define x4 $xr12 -#define VX0 $xr13 -#define VX1 $xr14 -#define VM0 $xr15 -#define VM1 $xr16 -#define VINC4 $xr17 -#define VINC8 $xr18 -#define VI0 $xr20 -#define VI1 $xr21 -#define VI2 $xr22 -#define VI3 $xr8 -#define VI4 $xr19 -#define VT0 $xr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - xvld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L10: - xvld VX0, X, 0 * SIZE - xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 - xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 - xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 - xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - xvfcmp.ceq.d VT0, VM0, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - xvfcmp.clt.d VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - xvfcmp.ceq.d VT0, VM0, x2 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - xvfcmp.clt.d VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - xvfcmp.ceq.d VT0, VM0, x3 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - xvfcmp.clt.d VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - xvfcmp.ceq.d VT0, VM0, x4 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - xvfcmp.clt.d VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.d i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - addi.d I, I, -1 - xvfmaxa.d VM1, x1, VM0 - xvfcmp.ceq.d VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S deleted file mode 100644 index fb2d5bac1..000000000 --- a/kernel/loongarch64/idamax_lsx.S +++ /dev/null @@ -1,267 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC2 $vr17 -#define VINC4 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.d i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L11 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.d VI1, VI1, VINC4 - vld VX1, X, 2 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - vld VX0, X, 4 * SIZE - vadd.d VI1, VI2, VINC2 - vld VX1, X, 6 * SIZE - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L16 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L17 - .align 3 - -.L16: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L17: - movfr2gr.d i0, $f20 - .align 3 - -.L11: //INCX==1 and N<8 - andi I, N, 7 - bge $r0, I, .L14 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L13: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - addi.d X, X, SIZE - movgr2fr.d $f21, i1 - blt $r0, I, .L13 - movfr2gr.d i0, $f20 - .align 3 - -.L14: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.d i0, i0, 1 - ld.d t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.d t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.d VM0, t2, 1 - slli.d i0, i0, 1 //2 - vreplgr2vr.d VINC2, i0 - slli.d i0, i0, 1 //4 - vreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 - vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 1 - vinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 - .align 3 - -.L24: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI1, VINC4 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t2, 1 - vadd.d VI1, VI2, VINC2 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t2, 1 - vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 - addi.d I, I, -1 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.d VI1, VI0, 0 - vreplvei.d VI2, VI0, 1 - vreplvei.d x1, VM0, 0 - vreplvei.d x2, VM0, 1 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.d $f17, TEMP - ffint.d.l $f17, $f17 - vfcmp.ceq.d VT0, x2, x1 - fcmp.ceq.d $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.d VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L27 - .align 3 - -.L26: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 - .align 3 - -.L27: - movfr2gr.d i0, $f20 - .align 3 - -.L21: // N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d I, I, -1 - addi.d i1, i1, 1 - add.d X, X, INCX - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.d i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S deleted file mode 100644 index a18aa7354..000000000 --- a/kernel/loongarch64/isamax_lsx.S +++ /dev/null @@ -1,275 +0,0 @@ -#define ASSEMBLER - -#include "common.h" - -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r12 -#define t1 $r13 -#define t2 $r15 -#define t3 $r18 -#define t4 $r16 -#define i0 $r17 -#define i1 $r14 -#define TEMP $r19 -#define x1 $vr9 -#define x2 $vr10 -#define x3 $vr11 -#define x4 $vr12 -#define VX0 $vr13 -#define VX1 $vr14 -#define VM0 $vr15 -#define VM1 $vr16 -#define VINC4 $vr17 -#define VINC8 $vr18 -#define VI0 $vr20 -#define VI1 $vr21 -#define VI2 $vr22 -#define VI3 $vr8 -#define VI4 $vr19 -#define VT0 $vr23 - - PROLOGUE - li.d i0, 0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - bne INCX, TEMP, .L20 - vld VM0, X, 0 - addi.w i0, i0, 1 - srai.d I, N, 3 - bge $r0, I, .L21 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L10: - vld VX0, X, 0 * SIZE - vadd.w VI1, VI1, VINC8 - vld VX1, X, 4 * SIZE - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L10 - .align 3 - -.L15: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - b .L26 - .align 3 - -.L20: // INCX!=1 - move TEMP, X - addi.w i0, i0, 1 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t1, 0 - srai.d I, N, 3 - bge $r0, I, .L21 - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - vinsgr2vr.w VM0, t2, 1 - vinsgr2vr.w VM0, t3, 2 - vinsgr2vr.w VM0, t4, 3 - slli.w i0, i0, 2 //4 - vreplgr2vr.w VINC4, i0 - slli.w i0, i0, 1 //8 - vreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 - vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 2 - addi.w i0, i0, 1 - vinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 5 - vinsgr2vr.w VI0, i0, 0 //1 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 2 //3 - addi.w i0, i0, 1 - vinsgr2vr.w VI0, i0, 3 //4 - .align 3 - -.L24: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - vadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vadd.w VI2, VI1, VINC4 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 - blt $r0, I, .L24 - .align 3 - -.L25: - vreplvei.w VI1, VI0, 0 - vreplvei.w VI2, VI0, 1 - vreplvei.w VI3, VI0, 2 - vreplvei.w VI4, VI0, 3 - vreplvei.w x1, VM0, 0 - vreplvei.w x2, VM0, 1 - vreplvei.w x3, VM0, 2 - vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC4, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC8, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC8, VINC4, VT0 - li.d TEMP, 1 //处理尾数相等时取最小序号 - movgr2fr.w $f17, TEMP - ffint.s.w $f17, $f17 - vfcmp.ceq.s VT0, VM0, x1 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L26: - vfcmp.ceq.s VT0, VM0, x2 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 - .align 3 - -.L27: - vfcmp.ceq.s VT0, VM0, x3 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - vfcmp.ceq.s VT0, VM0, x4 - fcmp.ceq.s $fcc0, $f23, $f17 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: - movfr2gr.s i0, $f20 - .align 3 - -.L21: //N<8 - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 - addi.d i1, i1, 1 //current index - movgr2fr.d $f21, i1 - movgr2fr.d $f20, i0 - .align 3 - -.L22: - fld.s $f9, X, 0 - addi.d I, I, -1 - vfmaxa.s VM1, x1, VM0 - vfcmp.ceq.s VT0, VM0, VM1 - add.d X, X, INCX - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 - addi.d i1, i1, 1 - movgr2fr.d $f21, i1 - blt $r0, I, .L22 - movfr2gr.s i0, $f20 - .align 3 - -.L999: - move $r4, $r17 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 179ed51d3b2df5e0df8a28d184bd169efa7f2b61 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 21 Dec 2023 14:18:39 +0800 Subject: [PATCH 519/718] Add dgemm_kernel_8x4.S file. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 14 + kernel/loongarch64/dgemm_kernel_8x4.S | 2894 ++++++++++++++++++++++ param.h | 4 +- 3 files changed, 2910 insertions(+), 2 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_8x4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a8a6dd82f..a78c0dbc5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,4 +58,18 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +DGEMMKERNEL = dgemm_kernel_8x4.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S new file mode 100644 index 000000000..405f1bd97 --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_8x4.S @@ -0,0 +1,2894 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Cycle control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define D14 $vr30 +#define D15 $vr31 +#define VALPHA $vr15 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + +.macro KERNEL2x8x4 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vldrepl.d U13, B0, 0x08 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vldrepl.d U14, B0, 0x10 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U15, B0, 0x18 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + vldrepl.d U5, B0, 0x08 + vfmadd.d D10, U10, U14, D10 + vfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + vldrepl.d U6, B0, 0x10 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + vldrepl.d U7, B0, 0x18 + vfmadd.d D14, U10, U15, D14 + vfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + vldrepl.d U12, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U13, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U14, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U15, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U8, U14, D8 + vfmadd.d D9, U9, U14, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U8, U15, D12 + vfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vfmadd.d D2, U9, U12, D2 + vfmadd.d D3, U9, U13, D3 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vldrepl.d U8, A0, 0x08 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vld U12, B0, 0x20 + vld U13, B0, 0x30 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x40 +.endm + +.macro KERNEL2x1x4_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U8, U13, D1 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x8x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x10 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x20 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vld U11, A0, 0x30 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vld U3, A0, 0x30 + vfmadd.d D6, U10, U13, D6 + vfmadd.d D7, U11, U13, D7 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 +.endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x2_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vfmadd.d D4, U8, U13, D4 + vfmadd.d D5, U9, U13, D5 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + vld U8, A0, 0x10 + vldrepl.d U12, B0, 0x10 + vldrepl.d U13, B0, 0x18 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x2_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D4, U8, U13, D4 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + vldrepl.d U8, A0, 0x08 + vld U12, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x2_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + + vld U4, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x8x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vldrepl.d U4, B0, 0x00 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vld U8, A0, 0x40 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vld U9, A0, 0x50 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vld U10, A0, 0x60 + vld U11, A0, 0x70 + + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + + vld U1, A0, 0x10 + vfmadd.d D2, U10, U12, D2 + vfmadd.d D3, U11, U12, D3 + + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + vld U8, A0, 0x20 + vld U9, A0, 0x30 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vldrepl.d U12, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x1_END + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vfmadd.d D0, U8, U12, D0 + vfmadd.d D1, U9, U12, D1 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vld U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + vld U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vldrepl.d U8, A0, 0x00 + vfmadd.d D0, U0, U4, D0 + vldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + vldrepl.d U0, A0, 0x00 + vfmadd.d D0, U8, U12, D0 + vldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + vfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + + PROLOGUE + + addi.d $sp, $sp, -112 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + ST $f30, $sp, 88 + ST $f31, $sp, 96 + ST ALPHA, $sp, 104 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + vldrepl.d VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop include Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + preld 0, C0, 0x20 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + preld 0, C1, 0x20 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + vldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + preld 0, C2, 0x20 + vfmul.d D10, U2, U6 + vfmul.d D11, U3, U6 + + vldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + preld 0, C3, 0x20 + vfmul.d D14, U2, U7 + vfmul.d D15, U3, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x8x4 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + +.L_TL1_END: + KERNEL8x8x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + /* Cumulative D0~D15 */ + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + vfmadd.d D10, U2, U6, D10 + vfmadd.d D11, U3, U6, D11 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + vfmadd.d D14, U2, U7, D14 + vfmadd.d D15, U3, U7, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D10, D10, VALPHA + vfmul.d D11, D11, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA + vfmul.d D14, D14, VALPHA + vfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 + + /* Load C2 */ + vld U8, C2, 0x00 + vld U9, C2, 0x10 + vld U10, C2, 0x20 + vld U11, C2, 0x30 + vfmadd.d D8, D8, VALPHA, U8 + vfmadd.d D9, D9, VALPHA, U9 + vfmadd.d D10, D10, VALPHA, U10 + vfmadd.d D11, D11, VALPHA, U11 + + /* Load C3 */ + vld U0, C3, 0x00 + vld U1, C3, 0x10 + vld U2, C3, 0x20 + vld U3, C3, 0x30 + vfmadd.d D12, D12, VALPHA, U0 + vfmadd.d D13, D13, VALPHA, U1 + vfmadd.d D14, D14, VALPHA, U2 + vfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + vst D10, C2, 0x20 + vst D11, C2, 0x30 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + vst D14, C3, 0x20 + vst D15, C3, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_M0 + + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + vldrepl.d U6, B0, 0x10 + /* line 3 */ + vfmul.d D8, U0, U6 + vfmul.d D9, U1, U6 + + vldrepl.d U7, B0, 0x18 + /* line 4 */ + vfmul.d D12, U0, U7 + vfmul.d D13, U1, U7 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + vldrepl.d U14, B0, 0x10 + vldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + vldrepl.d U6, B0, 0x10 + vfmadd.d D8, U0, U6, D8 + vfmadd.d D9, U1, U6, D9 + + vldrepl.d U7, B0, 0x18 + vfmadd.d D12, U0, U7, D12 + vfmadd.d D13, U1, U7, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D8, D8, VALPHA + vfmul.d D9, D9, VALPHA + vfmul.d D12, D12, VALPHA + vfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 + + /* Load C2 */ + vld U4, C2, 0x00 + vld U5, C2, 0x10 + vfmadd.d D8, D8, VALPHA, U4 + vfmadd.d D9, D9, VALPHA, U5 + + /* Load C3 */ + vld U6, C3, 0x00 + vld U7, C3, 0x10 + vfmadd.d D12, D12, VALPHA, U6 + vfmadd.d D13, D13, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + /* Store C2 */ + vst D8, C2, 0x00 + vst D9, C2, 0x10 + /* Store C3 */ + vst D12, C3, 0x00 + vst D13, C3, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + vfmul.d D2, U1, U4 + vfmul.d D3, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + vldrepl.d U8, A0, 0x00 + vldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + vldrepl.d U0, A0, 0x00 + vldrepl.d U1, A0, 0x08 + + vld U4, B0, 0x00 + vld U5, B0, 0x10 + + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + vfmadd.d D2, U1, U4, D2 + vfmadd.d D3, U1, U5, D3 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 + vstelm.d D2, C0, 0x08, 0x00 + vstelm.d D2, C1, 0x08, 0x01 + vstelm.d D3, C2, 0x08, 0x00 + vstelm.d D3, C3, 0x08, 0x01 +#else + /* Load C0 */ + vld U0, C0, 0x00 + /* Load C1 */ + vld U1, C1, 0x00 + /* Load C2 */ + vld U2, C2, 0x00 + /* Load C3 */ + vld U3, C3, 0x00 + + vilvl.d D4, D2, D0 //C0 + vilvh.d D5, D2, D0 //C1 + vilvl.d D6, D3, D1 //C2 + vilvh.d D7, D3, D1 //C3 + + vfmadd.d D0, D4, VALPHA, U0 + vfmadd.d D2, D5, VALPHA, U1 + vfmadd.d D1, D6, VALPHA, U2 + vfmadd.d D3, D7, VALPHA, U3 + + vst D0, C0, 0x00 + vst D2, C1, 0x00 + vst D1, C2, 0x00 + vst D3, C3, 0x00 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmul.d D0, U0, U4 + vfmul.d D1, U0, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + vld U12, B0, 0x00 + vld U13, B0, 0x10 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + vldrepl.d U0, A0, 0x00 + vld U4, B0, 0x00 + vld U5, B0, 0x10 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U0, U5, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: + vldrepl.d VALPHA, $sp, 104 +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + vstelm.d D1, C2, 0x00, 0x00 + vstelm.d D1, C3, 0x00, 0x01 +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vldrepl.d U1, C1, 0x00 + vilvl.d D4, U1, U0 + vfmadd.d D6, D0, VALPHA, D4 + + vldrepl.d U2, C2, 0x00 + vldrepl.d U3, C3, 0x00 + vilvl.d D5, U3, U2 + vfmadd.d D7, D1, VALPHA, D5 + + vstelm.d D6, C0, 0x00, 0x00 + vstelm.d D6, C1, 0x00, 0x01 + vstelm.d D7, C2, 0x00, 0x00 + vstelm.d D7, C3, 0x00, 0x01 +#endif // #if defined(TRMMKERNEL) + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + /* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/ + + vldrepl.d VALPHA, $sp, 104 + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N3_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a1, a0} + * U1 = {a3, a2} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + vfmul.d D6, U2, U5 + vfmul.d D7, U3, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x8x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x8x2_END + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + vfmadd.d D6, U2, U5, D6 + vfmadd.d D7, U3, U5, D7 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA + vfmul.d D6, D6, VALPHA + vfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + vld U4, C1, 0x00 + vld U5, C1, 0x10 + vld U6, C1, 0x20 + vld U7, C1, 0x30 + vfmadd.d D4, D4, VALPHA, U4 + vfmadd.d D5, D5, VALPHA, U5 + vfmadd.d D6, D6, VALPHA, U6 + vfmadd.d D7, D7, VALPHA, U7 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + vst D6, C1, 0x20 + vst D7, C1, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 8, considering M=4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N3_M0 + + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + vldrepl.d U5, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U5 + vfmul.d D5, U1, U5 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + vldrepl.d U5, B0, 0x08 + vfmadd.d D4, U0, U5, D4 + vfmadd.d D5, U1, U5, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D4, D4, VALPHA + vfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + vld U2, C1, 0x00 + vld U3, C1, 0x10 + vfmadd.d D4, D4, VALPHA, U2 + vfmadd.d D5, D5, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + /* Store C1 */ + vst D4, C1, 0x00 + vst D5, C1, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + vldrepl.d U4, B0, 0x08 + /* line 2 */ + vfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + vldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vldrepl.d U5, B0, 0x08 + vfmadd.d D0, U0, U4, D0 + + vfmadd.d D4, U0, U5, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + vld U1, C1, 0x00 + vfmadd.d D4, D4, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + vst D0, C0, 0x00 + vst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + vldrepl.d U0, A0, 0x00 + + vld U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C1, 0x00 + vilvl.d U2, U1, U0 + vfmadd.d D0, D0, VALPHA, U2 +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C1, 0x00, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! ************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 3)) goto L_N1_M8 */ + srai.d I, M, 3 /* I = bm >> 3 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 + * U0 = {a3, a2} + * U1 = {a1, a0} + * U2 = {a5, a4} + * U3 = {a7, a6} + */ + + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + vfmul.d D2, U2, U4 + vfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + vld U10, A0, 0x20 + vld U11, A0, 0x30 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x8x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x8x1_END + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + vld U2, A0, 0x20 + vld U3, A0, 0x30 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + vfmadd.d D2, U2, U4, D2 + vfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA + vfmul.d D2, D2, VALPHA + vfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vld U2, C0, 0x20 + vld U3, C0, 0x30 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 + vfmadd.d D2, D2, VALPHA, U2 + vfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + vst D2, C0, 0x20 + vst D3, C0, 0x30 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x8 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 7 + beq ZERO,I, .L_N1_M0 + + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + vfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + + vld U8, A0, 0x00 + vld U9, A0, 0x10 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + vld U0, A0, 0x00 + vld U1, A0, 0x10 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + vfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA + vfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vld U1, C0, 0x10 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + vfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + vst D0, C0, 0x00 + vst D1, C0, 0x10 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + + vld U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + vld U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vld U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + vstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + /* line 1 */ + vfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + + vldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + + vldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END + +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + vldrepl.d U0, A0, 0x00 + + vldrepl.d U4, B0, 0x00 + vfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + vfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + vldrepl.d U0, C0, 0x00 + vfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + vstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f24, $sp, 40 + LD $f25, $sp, 48 + LD $f26, $sp, 56 + LD $f27, $sp, 64 + LD $f28, $sp, 72 + LD $f29, $sp, 80 + LD $f30, $sp, 88 + LD $f31, $sp, 96 + addi.d $sp, $sp, 112 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/param.h b/param.h index ee4640f57..e4e242d5d 100644 --- a/param.h +++ b/param.h @@ -2888,8 +2888,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_N 4 From e771be185ee3ae604ce0b6ffb0dc38258b04f866 Mon Sep 17 00:00:00 2001 From: guxiwei Date: Thu, 21 Dec 2023 14:28:06 +0800 Subject: [PATCH 520/718] Optimize copy functions with lsx. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 8 +- kernel/loongarch64/dgemm_ncopy_4_lsx.S | 185 +++++++ kernel/loongarch64/dgemm_ncopy_8_lsx.S | 283 +++++++++++ kernel/loongarch64/dgemm_tcopy_4_lsx.S | 280 +++++++++++ kernel/loongarch64/dgemm_tcopy_8_lsx.S | 597 +++++++++++++++++++++++ 5 files changed, 1349 insertions(+), 4 deletions(-) create mode 100644 kernel/loongarch64/dgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index a78c0dbc5..00cb769eb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPY = dgemm_ncopy_8_lsx.S +DGEMMITCOPY = dgemm_tcopy_8_lsx.S +DGEMMONCOPY = dgemm_ncopy_4_lsx.S +DGEMMOTCOPY = dgemm_tcopy_4_lsx.S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..048a49af6 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S @@ -0,0 +1,185 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 +.L_I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_2I3: + andi I, M, 0x01 + beq ZERO, I, .L_N1 +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..30bebe8df --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S @@ -0,0 +1,283 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r21 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + PROLOGUE + push_if_used 26, 32 + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x03 + beq J, ZERO, .L_N4 +.L_J1: + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_I7 +.L_I1: + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \ + U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \ + U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \ + U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \ + U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30 + GINTERLACE v, d, D0, D4, U1, U0 + GINTERLACE v, d, D1, D5, U3, U2 + GINTERLACE v, d, D2, D6, U5, U4 + GINTERLACE v, d, D3, D7, U7, U6 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \ + D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70 + addi.d TD, TD, 0x80 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 +.L_I0: + blt ZERO, J, .L_J1 +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 +.L_4I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + addi.d TD, TD, 0x40 + + GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10 + GINTERLACE v, d, D0, D2, U1, U0 + GINTERLACE v, d, D1, D3, U3, U2 + GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 +.L_2I1: /* I-- */ + GLD v, , U0, S1, 0x00, U1, S2, 0x00 + GINTERLACE v, d, D0, D1, U1, U0 + GST v, , D0, TD, 0x00, D1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..134066471 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S @@ -0,0 +1,280 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 18, 8 + + move S0, SRC + move P0, DST + + // Find P0, P2, P3 + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + pop_if_used 18, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..a7e3ef69c --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S @@ -0,0 +1,597 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x03 + srai.d T1, N, 0x02 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x02 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x01 + slli.d T0, T0, 0x01 + mul.d P4, M, T0 + slli.d P4, P4, 0x03 + add.d P4, DST, P4 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x06 + beq ZERO, J, .L_M7 +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x03 + addi.d J, J, -1 + beq ZERO, I, .L_N7 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S5, 0x20 + vld U3, S5, 0x30 + vld U4, S6, 0x00 + vld U5, S6, 0x10 + vld U6, S6, 0x20 + vld U7, S6, 0x30 + + vst U0, P1, 0x100 + vst U1, P1, 0x110 + vst U2, P1, 0x120 + vst U3, P1, 0x130 + vst U4, P1, 0x140 + vst U5, P1, 0x150 + vst U6, P1, 0x160 + vst U7, P1, 0x170 + + vld U0, S7, 0x00 + vld U1, S7, 0x10 + vld U2, S7, 0x20 + vld U3, S7, 0x30 + vld U4, S8, 0x00 + vld U5, S8, 0x10 + vld U6, S8, 0x20 + vld U7, S8, 0x30 + + vst U0, P1, 0x180 + vst U1, P1, 0x190 + vst U2, P1, 0x1a0 + vst U3, P1, 0x1b0 + vst U4, P1, 0x1c0 + vst U5, P1, 0x1d0 + vst U6, P1, 0x1e0 + vst U7, P1, 0x1f0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + vld U0, S5, 0x00 + vld U1, S5, 0x10 + vld U2, S6, 0x00 + vld U3, S6, 0x10 + vld U4, S7, 0x00 + vld U5, S7, 0x10 + vld U6, S8, 0x00 + vld U7, S8, 0x10 + + vst U0, P2, 0x80 + vst U1, P2, 0x90 + vst U2, P2, 0xa0 + vst U3, P2, 0xb0 + vst U4, P2, 0xc0 + vst U5, P2, 0xd0 + vst U6, P2, 0xe0 + vst U7, P2, 0xf0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P2, P2, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + vld U4, S5, 0x00 + vld U5, S6, 0x00 + vld U6, S7, 0x00 + vld U7, S8, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + vst U4, P3, 0x40 + vst U5, P3, 0x50 + vst U6, P3, 0x60 + vst U7, P3, 0x70 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P3, P3, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + fst.d F4, P4, 0x20 + fst.d F5, P4, 0x28 + + fst.d F6, P4, 0x30 + fst.d F7, P4, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P4, P4, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x03 + beq ZERO, I, .L_4N7 +.L_4I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + vld U0, S3, 0x00 + vld U1, S3, 0x10 + vld U2, S3, 0x20 + vld U3, S3, 0x30 + vld U4, S4, 0x00 + vld U5, S4, 0x10 + vld U6, S4, 0x20 + vld U7, S4, 0x30 + + vst U0, P1, 0x80 + vst U1, P1, 0x90 + vst U2, P1, 0xa0 + vst U3, P1, 0xb0 + vst U4, P1, 0xc0 + vst U5, P1, 0xd0 + vst U6, P1, 0xe0 + vst U7, P1, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + vld U4, S3, 0x00 + vld U5, S3, 0x10 + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + vst U4, P2, 0x40 + vst U5, P2, 0x50 + vst U6, P2, 0x60 + vst U7, P2, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P2, P2, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + vst U2, P3, 0x20 + vst U3, P3, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P3, P3, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + fst.d F2, P4, 0x10 + fst.d F3, P4, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P4, P4, 0x20 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x03 + beq ZERO, I, .L_2N7 +.L_2I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + vst U4, P1, 0x40 + vst U5, P1, 0x50 + vst U6, P1, 0x60 + vst U7, P1, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + vst U2, P2, 0x20 + vst U3, P2, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P2, P2, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, P3, 0x00 + vst U1, P3, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P3, P3, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P4, P4, 0x10 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x03 + beq ZERO, I, .L_1N7 +.L_1I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, P1, 0x00 + vst U1, P1, 0x10 + vst U2, P1, 0x20 + vst U3, P1, 0x30 + + addi.d S1, S1, 0x40 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, P2, 0x00 + vst U1, P2, 0x10 + + addi.d S1, S1, 0x20 + addi.d P2, P2, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + vld U0, S1, 0x00 + vst U0, P3, 0x00 + + addi.d S1, S1, 0x10 + addi.d P3, P3, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P4, 0x00 + + addi.d S1, S1, 0x08 + addi.d P4, P4, 0x08 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE From 06fd5b5995e66e7b54bb4b8496a3b946cd56212e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 10:44:02 +0800 Subject: [PATCH 521/718] loongarch64: Add and Refine asum optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/asum_lasx.S | 257 ++++++++++++++++ kernel/loongarch64/asum_lsx.S | 258 ++++++++++++++++ kernel/loongarch64/casum_lasx.S | 329 +++++++++++++++++++++ kernel/loongarch64/casum_lsx.S | 358 +++++++++++++++++++++++ kernel/loongarch64/dasum_lasx.S | 148 ---------- kernel/loongarch64/dasum_lsx.S | 158 ---------- kernel/loongarch64/sasum_lasx.S | 157 ---------- kernel/loongarch64/sasum_lsx.S | 148 ---------- 10 files changed, 1210 insertions(+), 615 deletions(-) create mode 100644 kernel/loongarch64/asum_lasx.S create mode 100644 kernel/loongarch64/asum_lsx.S create mode 100644 kernel/loongarch64/casum_lasx.S create mode 100644 kernel/loongarch64/casum_lsx.S delete mode 100644 kernel/loongarch64/dasum_lasx.S delete mode 100644 kernel/loongarch64/dasum_lsx.S delete mode 100644 kernel/loongarch64/sasum_lasx.S delete mode 100644 kernel/loongarch64/sasum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 00cb769eb..201427dcd 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S -SASUMKERNEL = sasum_lsx.S -DASUMKERNEL = dasum_lsx.S +SASUMKERNEL = asum_lsx.S +DASUMKERNEL = asum_lsx.S +CASUMKERNEL = casum_lsx.S +ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e4c45e1fa..e822cb630 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S -SASUMKERNEL = sasum_lasx.S -DASUMKERNEL = dasum_lasx.S +SASUMKERNEL = asum_lasx.S +DASUMKERNEL = asum_lasx.S +CASUMKERNEL = casum_lasx.S +ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S new file mode 100644 index 000000000..9a2c031f3 --- /dev/null +++ b/kernel/loongarch64/asum_lasx.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define VT0 $xr23 +#define VT1 $xr22 +#define res1 $xr16 +#define res2 $xr17 +#define res0 $xr18 +#define neg1 $xr19 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfcmp.clt.s VT0, VX0, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvfadd.s res1, VX0, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S new file mode 100644 index 000000000..512b01404 --- /dev/null +++ b/kernel/loongarch64/asum_lsx.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define VT0 $vr23 +#define VT1 $vr22 +#define res1 $vr16 +#define res2 $vr17 +#define res0 $vr18 +#define neg1 $vr19 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD $f12, X, 0 * SIZE + FABS $f12, $f12 + ADD $f16, $f12, $f16 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S new file mode 100644 index 000000000..caf0ff969 --- /dev/null +++ b/kernel/loongarch64/casum_lasx.S @@ -0,0 +1,329 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res0 $xr19 +#define neg1 $xr20 +#define VT0 $xr21 +#define VT1 $xr22 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + xvreplgr2vr.d neg1, t1 + xvffint.d.l neg1, neg1 +#else + li.w t1, -1 + xvreplgr2vr.w neg1, t1 + xvffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfmul.d VX0, neg1, VX2 + xvfmul.d VX1, neg1, VX3 + xvfcmp.clt.d VT0, VX2, res0 + xvfcmp.clt.d VT1, VX3, res0 + xvbitsel.v VX2, VX2, VX0, VT0 + xvbitsel.v VX3, VX3, VX1, VT1 + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VX2, neg1, VX0 + xvfmul.d VX3, neg1, VX1 + xvfcmp.clt.d VT0, VX0, res0 + xvfcmp.clt.d VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfmul.s VX2, neg1, VX0 + xvfmul.s VX3, neg1, VX1 + xvfcmp.clt.s VT0, VX0, res0 + xvfcmp.clt.s VT1, VX1, res0 + xvbitsel.v VX0, VX0, VX2, VT0 + xvbitsel.v VX1, VX1, VX3, VT1 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S new file mode 100644 index 000000000..4822f2080 --- /dev/null +++ b/kernel/loongarch64/casum_lsx.S @@ -0,0 +1,358 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res0 $vr19 +#define neg1 $vr20 +#define VT0 $vr21 +#define VT1 $vr22 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 +#ifdef DOUBLE + li.d t1, -1 + vreplgr2vr.d neg1, t1 + vffint.d.l neg1, neg1 +#else + li.w t1, -1 + vreplgr2vr.w neg1, t1 + vffint.s.w neg1, neg1 +#endif + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfmul.d VX0, neg1, VX2 + vfmul.d VX1, neg1, VX3 + vfcmp.clt.d VT0, VX2, res0 + vfcmp.clt.d VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + vld VX0, X, 8 * SIZE + vld VX1, X, 12 * SIZE + addi.d I, I, -1 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res3, VX1, VX0 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmul.d VX2, neg1, VX0 + vfmul.d VX3, neg1, VX1 + vfcmp.clt.d VT0, VX0, res0 + vfcmp.clt.d VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfmul.s VX2, neg1, VX0 + vfmul.s VX3, neg1, VX1 + vfcmp.clt.s VT0, VX0, res0 + vfcmp.clt.s VT1, VX1, res0 + vbitsel.v VX0, VX0, VX2, VT0 + vbitsel.v VX1, VX1, VX3, VT1 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfmul.s VX0, neg1, VX2 + vfmul.s VX1, neg1, VX3 + vfcmp.clt.s VT0, VX2, res0 + vfcmp.clt.s VT1, VX3, res0 + vbitsel.v VX2, VX2, VX0, VT0 + vbitsel.v VX3, VX3, VX1, VT1 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS a1, a1 + FABS a2, a2 + addi.d I, I, -1 + ADD a1, a1, a2 + ADD s1, a1, s1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + MOV $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S deleted file mode 100644 index 49de98c40..000000000 --- a/kernel/loongarch64/dasum_lasx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - xvreplgr2vr.d neg1, t1 - xvffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 4 * SIZE - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvfmul.d VX2, neg1, VX0 - xvfmul.d VX3, neg1, VX1 - xvfcmp.clt.d VT0, VX0, res0 - xvfcmp.clt.d VT1, VX1, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvbitsel.v VX1, VX1, VX3, VT1 - xvfadd.d res2, VX0, VX1 - xvfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S deleted file mode 100644 index 94750815e..000000000 --- a/kernel/loongarch64/dasum_lsx.S +++ /dev/null @@ -1,158 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d t1, -1 - vreplgr2vr.d neg1, t1 - vffint.d.l neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 2 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - vfmul.d VX2, neg1, VX0 - vfmul.d VX3, neg1, VX1 - vfcmp.clt.d VT0, VX0, res0 - vfcmp.clt.d VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.d res2, VX0, VX1 - vfadd.d res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.d $f12, X, 0 * SIZE - fabs.d $f12, $f12 - fadd.d $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.d $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S deleted file mode 100644 index a452701aa..000000000 --- a/kernel/loongarch64/sasum_lasx.S +++ /dev/null @@ -1,157 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $xr12 -#define VX1 $xr13 -#define VX2 $xr14 -#define VX3 $xr15 -#define VT0 $xr23 -#define VT1 $xr22 -#define res1 $xr16 -#define res2 $xr17 -#define res0 $xr18 -#define neg1 $xr19 - - PROLOGUE - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 - xvxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - xvreplgr2vr.w neg1, t1 - xvffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - xvld VX0, X, 0 * SIZE - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VX2, neg1, VX0 - xvfcmp.clt.s VT0, VX0, res0 - xvbitsel.v VX0, VX0, VX2, VT0 - xvfadd.s res1, VX0, res1 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - xvfadd.s res2, res1, res2 - xvpickve.w VX1, res1, 1 - xvpickve.w VX2, res1, 2 - xvpickve.w VX3, res1, 3 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 - xvfadd.s res1, VX0, res1 - xvfadd.s res1, VX1, res1 - xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S deleted file mode 100644 index 87026a144..000000000 --- a/kernel/loongarch64/sasum_lsx.S +++ /dev/null @@ -1,148 +0,0 @@ -#define ASSEMBLER -#include "common.h" -#define N $r4 -#define X $r5 -#define INCX $r6 -#define I $r17 -#define TEMP $r18 -#define t1 $r15 -#define t2 $r12 -#define t3 $r13 -#define t4 $r14 -#define VX0 $vr12 -#define VX1 $vr13 -#define VX2 $vr14 -#define VX3 $vr15 -#define VT0 $vr23 -#define VT1 $vr22 -#define res1 $vr16 -#define res2 $vr17 -#define res0 $vr18 -#define neg1 $vr19 - - PROLOGUE - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 - vxor.v res0, res0, res0 - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.w t1, -1 - vreplgr2vr.w neg1, t1 - vffint.s.w neg1, neg1 - li.d TEMP, SIZE - slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bge $r0, I, .L13 - .align 3 - -.L11: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L11 - .align 3 - -.L12: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L13: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L14: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L14 - b .L999 - .align 3 - -.L20: - bge $r0, I, .L23 - .align 3 - -.L21: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vfmul.s VX2, neg1, VX0 - vfmul.s VX3, neg1, VX1 - vfcmp.clt.s VT0, VX0, res0 - vfcmp.clt.s VT1, VX1, res0 - vbitsel.v VX0, VX0, VX2, VT0 - vbitsel.v VX1, VX1, VX3, VT1 - vfadd.s res2, VX0, VX1 - vfadd.s res1, res1, res2 - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - vreplvei.w VX1, res1, 1 - vreplvei.w VX2, res1, 2 - vreplvei.w VX3, res1, 3 - vfadd.s res1, VX1, res1 - vfadd.s res1, VX2, res1 - vfadd.s res1, VX3, res1 - .align 3 - -.L23: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L24: - fld.s $f12, X, 0 * SIZE - fabs.s $f12, $f12 - fadd.s $f16, $f12, $f16 - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L24 - .align 3 - -.L999: - fmov.s $f0, $f16 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file From 0753848e03e1298c162386df467f78bc15851cc4 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 16:54:01 +0800 Subject: [PATCH 522/718] loongarch64: Refine and add axpy optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- .../loongarch64/{daxpy_lasx.S => axpy_lasx.S} | 237 +++++- .../loongarch64/{daxpy_lsx.S => axpy_lsx.S} | 254 ++++++- kernel/loongarch64/caxpy_lasx.S | 707 ++++++++++++++++++ kernel/loongarch64/caxpy_lsx.S | 679 +++++++++++++++++ kernel/loongarch64/saxpy_lasx.S | 323 -------- kernel/loongarch64/saxpy_lsx.S | 338 --------- 8 files changed, 1839 insertions(+), 711 deletions(-) rename kernel/loongarch64/{daxpy_lasx.S => axpy_lasx.S} (52%) rename kernel/loongarch64/{daxpy_lsx.S => axpy_lsx.S} (53%) create mode 100644 kernel/loongarch64/caxpy_lasx.S create mode 100644 kernel/loongarch64/caxpy_lsx.S delete mode 100644 kernel/loongarch64/saxpy_lasx.S delete mode 100644 kernel/loongarch64/saxpy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 201427dcd..bdde126ad 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S -SAXPYKERNEL = saxpy_lsx.S -DAXPYKERNEL = daxpy_lsx.S +SAXPYKERNEL = axpy_lsx.S +DAXPYKERNEL = axpy_lsx.S +CAXPYKERNEL = caxpy_lsx.S +ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = saxpby_lsx.S DAXPBYKERNEL = daxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index e822cb630..7642b2a4d 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -40,8 +40,10 @@ DCOPYKERNEL = copy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S -SAXPYKERNEL = saxpy_lasx.S -DAXPYKERNEL = daxpy_lasx.S +SAXPYKERNEL = axpy_lasx.S +DAXPYKERNEL = axpy_lasx.S +CAXPYKERNEL = caxpy_lasx.S +ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = saxpby_lasx.S DAXPBYKERNEL = daxpby_lasx.S diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S similarity index 52% rename from kernel/loongarch64/daxpy_lasx.S rename to kernel/loongarch64/axpy_lasx.S index bafd871ab..707fd09b5 100644 --- a/kernel/loongarch64/daxpy_lasx.S +++ b/kernel/loongarch64/axpy_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 +#else + xvreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -70,6 +102,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfadd.s VX2, VX0, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -77,6 +116,7 @@ .align 3 .L112: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE @@ -86,6 +126,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + addi.d I, I, -1 + xvfmadd.s VX2, VX0, VXA, VX2 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L112 @@ -97,11 +144,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -114,6 +161,7 @@ .align 3 .L121: +#ifdef DOUBLE xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -158,6 +206,50 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -169,11 +261,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -185,6 +277,7 @@ .align 3 .L211: +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -217,6 +310,37 @@ addi.d I, I, -1 xvst VX3, Y, 4 * SIZE addi.d Y, Y, 8 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + addi.d Y, Y, 8 * SIZE +#endif blt $r0, I, .L211 .align 3 @@ -226,11 +350,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -243,6 +367,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -309,6 +434,73 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmadd.s VX2, VX0, VXA, VX2 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -319,15 +511,14 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 - b .L999 .align 3 .L999: diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S similarity index 53% rename from kernel/loongarch64/daxpy_lsx.S rename to kernel/loongarch64/axpy_lsx.S index fc88f0bb9..0d74e2bce 100644 --- a/kernel/loongarch64/daxpy_lsx.S +++ b/kernel/loongarch64/axpy_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define XX $r5 #define YY $r6 @@ -35,16 +62,20 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + FFINT a1, a1 movgr2fr.d a2, TEMP - ffint.d.l a2, a2 - fcmp.ceq.d $fcc0, ALPHA, a1 + FFINT a2, a2 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L999 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA +#ifdef DOUBLE vreplgr2vr.d VXA, t1 +#else + vreplgr2vr.w VXA, t1 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 @@ -56,11 +87,12 @@ .L11: bge $r0, I, .L113 - fcmp.ceq.d $fcc0, ALPHA, a2 + CMPEQ $fcc0, ALPHA, a2 bceqz $fcc0, .L112 .align 3 .L111: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -75,16 +107,27 @@ vld VX3, Y, 6 * SIZE vfadd.d VX2, VX0, VX2 vfadd.d VX3, VX1, VX3 - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfadd.s VX2, VX0, VX2 + vfadd.s VX3, VX1, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L111 b .L113 .align 3 .L112: +#ifdef DOUBLE vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE vld VX1, X, 2 * SIZE @@ -104,6 +147,19 @@ vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE addi.d Y, Y, 8 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX0, VXA, VX2 + vfmadd.s VX3, VX1, VXA, VX3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 +#endif blt $r0, I, .L112 .align 3 @@ -113,11 +169,11 @@ .align 3 .L114: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE addi.d Y, Y, SIZE blt $r0, I, .L114 @@ -130,6 +186,7 @@ .align 3 .L121: +#ifdef DOUBLE vld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY @@ -180,6 +237,54 @@ add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + vld VX1, X, 4 * SIZE + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE +#endif blt $r0, I, .L121 .align 3 @@ -189,11 +294,11 @@ .align 3 .L123: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE addi.d X, X, SIZE add.d Y, Y, INCY blt $r0, I, .L123 @@ -205,6 +310,7 @@ .align 3 .L211: +#ifdef DOUBLE vld VX2, Y, 0 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -242,6 +348,39 @@ vfmadd.d VX3, VX1, VXA, VX3 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmadd.s VX2, VX0, VXA, VX2 + vld VX3, Y, 4 * SIZE + vst VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 .align 3 @@ -252,11 +391,11 @@ .align 3 .L213: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX addi.d Y, Y, SIZE blt $r0, I, .L213 @@ -269,6 +408,7 @@ .align 3 .L222: +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -337,6 +477,74 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX2, VX0, VXA, VX2 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmadd.s VX3, VX1, VXA, VX3 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 .align 3 @@ -347,11 +555,11 @@ .align 3 .L224: - fld.d $f12, X, 0 * SIZE - fld.d $f14, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f14, Y, 0 * SIZE addi.d I, I, -1 - fmadd.d $f14, $f12, $f0, $f14 - fst.d $f14, Y, 0 * SIZE + MADD $f14, $f12, $f0, $f14 + ST $f14, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L224 diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S new file mode 100644 index 000000000..2b970fe70 --- /dev/null +++ b/kernel/loongarch64/caxpy_lasx.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + addi.d I, I, -1 +#else + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE +#endif + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmsub.d VX1, VXAR, x1, VX0 + xvfmadd.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfadd.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmsub.s VX1, VXAR, x1, VX0 + xvfmadd.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX2, VXAI, x1 + xvfmadd.d VX1, VXAR, x1, VX0 + xvfmsub.d VX3, x2, VXAR, VX2 + xvfadd.d x3, x3, VX1 + xvfsub.d x4, x4, VX3 +#else + xvfmul.s VX0, VXAI, x2 + xvfmul.s VX2, VXAI, x1 + xvfmadd.s VX1, VXAR, x1, VX0 + xvfmsub.s VX3, x2, VXAR, VX2 + xvfadd.s x3, x3, VX1 + xvfsub.s x4, x4, VX3 +#endif +#endif + addi.d I, I, -1 +#ifdef DOUBLE + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 +#else + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S new file mode 100644 index 000000000..85598d0b9 --- /dev/null +++ b/kernel/loongarch64/caxpy_lsx.S @@ -0,0 +1,679 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define XX $r5 +#define YY $r6 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L10 + bcnez $fcc1, .L999 +.L10: + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + vreplgr2vr.w VXAI, t2 +#endif + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX2, Y, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 +#endif +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE +#else + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 +#endif + add.d Y, Y, INCY +#if !defined(CONJ) +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmsub.s VX1, VXAR, x1, VX0 + vfmadd.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfadd.s x4, x4, VX3 +#endif +#else +#ifdef DOUBLE + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#else + vfmul.s VX0, VXAI, x2 + vfmul.s VX2, VXAI, x1 + vfmadd.s VX1, VXAR, x1, VX0 + vfmsub.s VX3, x2, VXAR, VX2 + vfadd.s x3, x3, VX1 + vfsub.s x4, x4, VX3 +#endif +#endif +#ifdef DOUBLE + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY +#if !defined(CONJ) + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmsub.d VX1, VXAR, x1, VX0 + vfmadd.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfadd.d x4, x4, VX3 +#else + vfmul.d VX0, VXAI, x2 + vfmul.d VX2, VXAI, x1 + vfmadd.d VX1, VXAR, x1, VX0 + vfmsub.d VX3, x2, VXAR, VX2 + vfadd.d x3, x3, VX1 + vfsub.d x4, x4, VX3 +#endif + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 +#else + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 +#endif + add.d YY, YY, INCY + blt $r0, I, .L222 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 +#if !defined(CONJ) + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s3, ALPHAR, a1, s1 + MADD s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + ADD s4, s4, a4 +#else + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MADD s3, ALPHAR, a1, s1 + MSUB s4, a2, ALPHAR, s2 + ADD s3, s3, a3 + SUB s4, a4, s4 +#endif + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S deleted file mode 100644 index 609e26328..000000000 --- a/kernel/loongarch64/saxpy_lasx.S +++ /dev/null @@ -1,323 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfadd.s VX2, VX0, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - addi.d I, I, -1 - xvfmadd.s VX2, VX0, VXA, VX2 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmadd.s VX2, VX0, VXA, VX2 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S deleted file mode 100644 index f47415ed6..000000000 --- a/kernel/loongarch64/saxpy_lsx.S +++ /dev/null @@ -1,338 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define XX $r5 -#define YY $r6 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define Y $r9 -#define INCY $r10 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define a1 $f12 -#define a2 $f13 -#define a3 $f14 -#define a4 $f15 -#define b1 $f16 -#define b2 $f17 -#define b3 $f18 -#define b4 $f19 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA $vr23 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L999 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L113 - fcmp.ceq.s $fcc0, ALPHA, a2 - bceqz $fcc0, .L112 - .align 3 - -.L111: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfadd.s VX2, VX0, VX2 - vfadd.s VX3, VX1, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L113 - .align 3 - -.L112: - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX0, VXA, VX2 - vfmadd.s VX3, VX1, VXA, VX3 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L113 - .align 3 - -.L113: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L114: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - addi.d Y, Y, SIZE - blt $r0, I, .L114 - b .L999 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L122 - move YY, Y - .align 3 - -.L121: - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - vld VX1, X, 4 * SIZE - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - .align 3 - -.L122: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L123: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - addi.d X, X, SIZE - add.d Y, Y, INCY - blt $r0, I, .L123 - b .L999 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L212 - .align 3 - -.L211: - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmadd.s VX2, VX0, VXA, VX2 - vld VX3, Y, 4 * SIZE - vst VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - .align 3 - -.L212: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L213: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - addi.d Y, Y, SIZE - blt $r0, I, .L213 - b .L999 - .align 3 - -.L22: - bge $r0, I, .L223 - move YY, Y - .align 3 - -.L222: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX2, VX0, VXA, VX2 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmadd.s VX3, VX1, VXA, VX3 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - .align 3 - -.L223: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L224: - fld.s $f12, X, 0 * SIZE - fld.s $f14, Y, 0 * SIZE - addi.d I, I, -1 - fmadd.s $f14, $f12, $f0, $f14 - fst.s $f14, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L224 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 8785e948b534a7747611b36b46a69f5813a18cc2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 17:04:46 +0800 Subject: [PATCH 523/718] loongarch64: Add camin optimization function. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 1 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/camin_lasx.S | 199 +++++++++++++++++++++ kernel/loongarch64/camin_lsx.S | 211 +++++++++++++++++++++++ 4 files changed, 412 insertions(+) create mode 100644 kernel/loongarch64/camin_lasx.S create mode 100644 kernel/loongarch64/camin_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index bdde126ad..fc1766ff5 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S +CAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7642b2a4d..7de9d4440 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -13,6 +13,7 @@ CAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S +CAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S new file mode 100644 index 000000000..d7931d30a --- /dev/null +++ b/kernel/loongarch64/camin_lasx.S @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define TEMP $r16 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define a0 $f20 +#define a1 $f21 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VT0 $xr13 +#define VT1 $xr14 +#define res0 $xr18 +#define neg1 $xr19 +#define VX0 $xr20 +#define VX1 $xr21 +#define VM0 $xr22 +#define VM1 $xr23 + + PROLOGUE + MTC s1, $r0 + xvxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + xvreplve0.w VM0, VM0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + xvreplgr2vr.w neg1, I + xvffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, neg1, x1 + xvfmul.s x4, neg1, x2 + xvfcmp.clt.s VT0, x1, res0 + xvfcmp.clt.s VT1, x2, res0 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VT1 + addi.d X, X, 16 * SIZE + xvfadd.s VM1, x1, x2 + xvfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfmin.s VM1, x1, x2 + xvfmin.s VM0, x3, x4 + xvfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S new file mode 100644 index 000000000..e9ad6b04d --- /dev/null +++ b/kernel/loongarch64/camin_lsx.S @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $f14 +#define t2 $f18 +#define t3 $f15 +#define t4 $f17 +#define s1 $f22 +#define s2 $f9 +#define s3 $f10 +#define s4 $f11 +#define TEMP $r16 +#define a0 $f20 +#define a1 $f21 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VT0 $vr13 +#define VT1 $vr14 +#define res0 $vr18 +#define neg1 $vr19 +#define VX0 $vr20 +#define VX1 $vr21 +#define VM0 $vr22 +#define VM1 $vr23 + + PROLOGUE + MTC s1, $r0 + vxor.v res0, res0, res0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s s1, a1, a0 + vreplvei.w VM0, VM0, 0 + li.d TEMP, 1 + li.w I, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + vreplgr2vr.w neg1, I + vffint.s.w neg1, neg1 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L23 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + vld VX0, X, 8 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vld VX1, X, 12 * SIZE + vfadd.s VM1, x1, x2 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, neg1, x1 + vfmul.s x4, neg1, x2 + vfcmp.clt.s VT0, x1, res0 + vfcmp.clt.s VT1, x2, res0 + addi.d X, X, 16 * SIZE + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VT1 + vfadd.s x1, x1, x2 + vfmin.s VM1, x1, VM1 + vfmin.s VM0, VM0, VM1 + blt $r0, I, .L10 + .align 3 + +.L11: + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmin.s VM1, x1, x2 + vfmin.s VM0, x3, x4 + vfmin.s VM0, VM0, VM1 + b .L23 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L23 + .align 3 + +.L21: + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s1, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + addi.d I, I, -1 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s3, t1, t3 + fld.s t1, X, 0 * SIZE + fld.s t2, X, 1 * SIZE + add.d X, X, INCX + fld.s t3, X, 0 * SIZE + fld.s t4, X, 1 * SIZE + add.d X, X, INCX + fabs.s t1, t1 + fabs.s t2, t2 + fabs.s t3, t3 + fabs.s t4, t4 + fadd.s t1, t1, t2 + fadd.s t3, t3, t4 + fmin.s s4, t1, t3 + blt $r0, I, .L21 + .align 3 + +.L22: + fmin.s s1, s1, s2 + fmin.s s3, s3, s4 + fmin.s s1, s1, s3 + .align 3 + +.L23: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + fld.s a0, X, 0 * SIZE + fld.s a1, X, 1 * SIZE + addi.d I, I, -1 + fabs.s a0, a0 + fabs.s a1, a1 + fadd.s a0, a0, a1 + add.d X, X, INCX + fmin.s s1, a0, s1 + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 2a34fb4b80d494d76da7a9ea6a3d54dffbd57f37 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 27 Dec 2023 18:17:51 +0800 Subject: [PATCH 524/718] loongarch64: Add and refine scal optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 +- kernel/loongarch64/cscal_lasx.S | 645 +++++++++++++++++++++++ kernel/loongarch64/cscal_lsx.S | 571 ++++++++++++++++++++ kernel/loongarch64/scal_lasx.S | 282 ++++++++++ kernel/loongarch64/scal_lsx.S | 301 +++++++++++ kernel/loongarch64/sscal_lasx.S | 188 ------- kernel/loongarch64/sscal_lsx.S | 194 ------- 8 files changed, 1807 insertions(+), 386 deletions(-) create mode 100644 kernel/loongarch64/cscal_lasx.S create mode 100644 kernel/loongarch64/cscal_lsx.S create mode 100644 kernel/loongarch64/scal_lasx.S create mode 100644 kernel/loongarch64/scal_lsx.S delete mode 100644 kernel/loongarch64/sscal_lasx.S delete mode 100644 kernel/loongarch64/sscal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index fc1766ff5..7abdae55a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S -SSCALKERNEL = sscal_lsx.S -DSCALKERNEL = dscal_lsx.S +SSCALKERNEL = scal_lsx.S +DSCALKERNEL = scal_lsx.S +CSCALKERNEL = cscal_lsx.S +ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 7de9d4440..13f9f23ed 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -4,8 +4,10 @@ SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S -SSCALKERNEL = sscal_lasx.S -DSCALKERNEL = dscal_lasx.S +SSCALKERNEL = scal_lasx.S +DSCALKERNEL = scal_lasx.S +CSCALKERNEL = cscal_lasx.S +ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S new file mode 100644 index 000000000..3605a6c0e --- /dev/null +++ b/kernel/loongarch64/cscal_lasx.S @@ -0,0 +1,645 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + xvreplgr2vr.d VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 +#else + xvreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + xvreplgr2vr.w VXAI, t2 + xvxor.v VXZ, VXZ, VXZ + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + xvst VXZ, X, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + addi.d X, X, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, X, 0 * SIZE + xvst VX3, X, 8 * SIZE + addi.d X, X, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.d VXZ, X, 0, 0 + xvstelm.d VXZ, X, 1 * SIZE, 0 +#else + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + xvstelm.w VXZ, X, 0, 0 + xvstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAI, x2 + xvfsub.d x3, VXZ, x3 + xvfmul.d x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAI, x2 + xvfsub.s x3, VXZ, x3 + xvfmul.s x4, VXAI, x1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d x3, VXAR, x1 + xvfmul.d x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s x3, VXAR, x1 + xvfmul.s x4, VXAR, x2 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvfmul.d VX0, VXAI, x2 + xvfmsub.d x3, VXAR, x1, VX0 + xvfmul.d VX1, VXAI, x1 + xvfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.d x3, XX, 0 * SIZE, 0 + xvstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 1 + xvstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 2 + xvstelm.d x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d x3, XX, 0 * SIZE, 3 + xvstelm.d x4, XX, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvfmul.s VX0, VXAI, x2 + xvfmsub.s x3, VXAR, x1, VX0 + xvfmul.s VX1, VXAI, x1 + xvfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + xvstelm.w x3, XX, 0 * SIZE, 0 + xvstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 1 + xvstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 2 + xvstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 3 + xvstelm.w x4, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 4 + xvstelm.w x4, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 5 + xvstelm.w x4, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 6 + xvstelm.w x4, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w x3, XX, 0 * SIZE, 7 + xvstelm.w x4, XX, 1 * SIZE, 7 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S new file mode 100644 index 000000000..f442a754f --- /dev/null +++ b/kernel/loongarch64/cscal_lsx.S @@ -0,0 +1,571 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r7 +#define INCX $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + MTG t1, ALPHAR +#ifdef DOUBLE + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 +#else + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 +#endif + vxor.v VXZ, VXZ, VXZ + srai.d I, N, 2 + bne INCX, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 + b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + +.L14: + bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + b .L111 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L111: //alpha_r == 0.0 && alpha_i == 0.0 + vst VXZ, X, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, X, 4 * SIZE + vst VXZ, X, 6 * SIZE +#else + vst VXZ, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //alpha_r == 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //alpha_r != 0.0 && alpha_i == 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: //alpha_r != 0.0 && alpha_i != 0.0 + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move XX, X + CMPEQ $fcc0, ALPHAR, a1 + CMPEQ $fcc1, ALPHAI, a1 + bceqz $fcc0, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 + b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + +.L24: + bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + b .L221 //alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +.L221: //alpha_r == 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.d VXZ, X, 0, 0 + vstelm.d VXZ, X, 1 * SIZE, 0 +#else + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 + add.d X, X, INCX + vstelm.w VXZ, X, 0, 0 + vstelm.w VXZ, X, 1 * SIZE, 0 +#endif + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: //alpha_r == 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfsub.d x3, VXZ, x3 + vfmul.d x4, VXAI, x1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAI, x2 + vfsub.s x3, VXZ, x3 + vfmul.s x4, VXAI, x1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: //alpha_r != 0.0 && alpha_i == 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAR, x1 + vfmul.d x4, VXAR, x2 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s x3, VXAR, x1 + vfmul.s x4, VXAR, x2 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: //alpha_r != 0.0 && alpha_i != 0.0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d VX0, VXAI, x2 + vfmsub.d x3, VXAR, x1, VX0 + vfmul.d VX1, VXAI, x1 + vfmadd.d x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.d x3, XX, 0 * SIZE, 0 + vstelm.d x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d x3, XX, 0 * SIZE, 1 + vstelm.d x4, XX, 1 * SIZE, 1 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vfmul.s VX0, VXAI, x2 + vfmsub.s x3, VXAR, x1, VX0 + vfmul.s VX1, VXAI, x1 + vfmadd.s x4, VXAR, x2, VX1 + addi.d I, I, -1 + vstelm.w x3, XX, 0 * SIZE, 0 + vstelm.w x4, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 1 + vstelm.w x4, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 2 + vstelm.w x4, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w x3, XX, 0 * SIZE, 3 + vstelm.w x4, XX, 1 * SIZE, 3 +#endif + add.d XX, XX, INCX + blt $r0, I, .L224 + b .L997 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MSUB s1, ALPHAR, a1, s1 + MADD s2, ALPHAR, a2, s2 + ST s1, X, 0 * SIZE + ST s2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S new file mode 100644 index 000000000..48e2c0718 --- /dev/null +++ b/kernel/loongarch64/scal_lasx.S @@ -0,0 +1,282 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VT0 $xr14 +#define VT1 $xr15 +#define VALPHA $xr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA, TEMP +#else + xvreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvfmul.d VT1, VX1, VALPHA + xvstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT1, XX, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VT0, VX0, VALPHA + xvstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 4 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 5 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 6 + add.d XX, XX, INCX + xvstelm.w VT0, XX, 0, 7 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 +.L25: + xvxor.v VX0, VX0, VX0 + xvst VX0, X, 0 * SIZE +#ifdef DOUBLE + xvst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + xvreplgr2vr.d VALPHA , TEMP +#else + xvreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvfmul.d VT0, VX0, VALPHA + xvfmul.d VT1, VX1, VALPHA + xvst VT0, X, 0 * SIZE + xvst VT1, X, 4 * SIZE +#else + xvfmul.s VT0, VX0, VALPHA + xvst VT0, X, 0 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S new file mode 100644 index 000000000..1ffce7db2 --- /dev/null +++ b/kernel/loongarch64/scal_lsx.S @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define ALPHA $f0 +#define X $r7 +#define INCX $r8 +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r18 +#define t3 $r15 +#define t4 $r17 +#define XX $r16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VT0 $vr14 +#define VT1 $vr15 +#define VALPHA $vr19 +#define a1 $f8 +#define a2 $f23 + + PROLOGUE + + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + movgr2fr.d a2, TEMP + FFINT a2, a2 + slli.d TEMP, TEMP, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + CMPEQ $fcc0, ALPHA, a1 + bcnez $fcc0, .L20 //ALPHA==0 + CMPEQ $fcc0, ALPHA, a2 + bcnez $fcc0, .L999 //ALPHA==1 return + srai.d I, N, 3 + beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA, TEMP +#else + vreplgr2vr.w VALPHA, TEMP +#endif + move XX, X + .align 3 + +.L10: //ALPHA!=0|1 and INCX!=1 + bge $r0, I, .L32 + .align 3 + +.L11: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT0, VX0, VALPHA + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vstelm.d VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT0, XX, 0, 1 + add.d XX, XX, INCX + vfmul.d VT1, VX1, VALPHA + vstelm.d VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.d VT1, XX, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + vfmul.s VT0, VX0, VALPHA + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VT0, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT0, XX, 0, 3 + add.d XX, XX, INCX + vfmul.s VT1, VX1, VALPHA + vstelm.w VT1, XX, 0, 0 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 1 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 2 + add.d XX, XX, INCX + vstelm.w VT1, XX, 0, 3 +#endif + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L11 + b .L32 + .align 3 + +.L20: + srai.d I, N, 3 + beq INCX, TEMP, .L24 + bge $r0, I, .L22 + .align 3 + +.L21: + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + ST a1, X, 0 + add.d X, X, INCX + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L23: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + jirl $r0, $r1, 0 + .align 3 + +.L24: + bge $r0, I, .L26 /*N<8 INCX==1*/ + .align 3 + +.L25: + vxor.v VX0, VX0, VX0 + vst VX0, X, 0 * SIZE +#ifdef DOUBLE + vst VX0, X, 2 * SIZE + vst VX0, X, 4 * SIZE + vst VX0, X, 6 * SIZE +#else + vst VX0, X, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L25 + .align 3 + +.L26: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L27: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L27 + jirl $r0, $r1, 0 + .align 3 + +.L30: + bge $r0, I, .L32/*N<8 INCX==1*/ + MTG TEMP, ALPHA +#ifdef DOUBLE + vreplgr2vr.d VALPHA , TEMP +#else + vreplgr2vr.w VALPHA , TEMP +#endif + .align 3 + +.L31: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vfmul.d VT1, VX1, VALPHA + vld VX0, X, 4 * SIZE + vst VT0, X, 0 * SIZE + vst VT1, X, 2 * SIZE + vfmul.d VT0, VX0, VALPHA + vld VX1, X, 6 * SIZE + vst VT0, X, 4 * SIZE + vfmul.d VT1, VX1, VALPHA + vst VT1, X, 6 * SIZE + addi.d I, I, -1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VT0, VX0, VALPHA + vfmul.s VT1, VX1, VALPHA + addi.d I, I, -1 + vst VT0, X, 0 * SIZE + vst VT1, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + blt $r0, I, .L31 + .align 3 + +.L32: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L33: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + MUL a1, ALPHA, a1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L33 + jirl $r0, $r1, 0 + .align 3 + +.L999: + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S deleted file mode 100644 index 329f24659..000000000 --- a/kernel/loongarch64/sscal_lasx.S +++ /dev/null @@ -1,188 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $xr12 -#define VX1 $xr13 -#define VT0 $xr14 -#define VT1 $xr15 -#define VALPHA $xr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA, TEMP - move XX, X - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvfmul.s VT0, VX0, VALPHA - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - xvxor.v VX0, VX0, VX0 - xvst VX0, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - xvreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - xvld VX0, X, 0 * SIZE - addi.d I, I, -1 - xvfmul.s VT0, VX0, VALPHA - xvst VT0, X, 0 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S deleted file mode 100644 index d0ea1307d..000000000 --- a/kernel/loongarch64/sscal_lsx.S +++ /dev/null @@ -1,194 +0,0 @@ -#define ASSEMBLER -#include "common.h" - -#define N $r4 -#define ALPHA $f0 -#define X $r7 -#define INCX $r8 -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r18 -#define t3 $r15 -#define t4 $r17 -#define XX $r16 -#define VX0 $vr12 -#define VX1 $vr13 -#define VT0 $vr14 -#define VT1 $vr15 -#define VALPHA $vr19 -#define a1 $f8 -#define a2 $f23 - - PROLOGUE - - bge $r0, N, .L999 - bge $r0, INCX, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - movgr2fr.d a2, TEMP - ffint.s.l a2, a2 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L20 //ALPHA==0 - fcmp.ceq.s $fcc0, ALPHA, a2 - bcnez $fcc0, .L999 //ALPHA==1 return - srai.d I, N, 3 - beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1 - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA, TEMP - move XX, X - .align 3 - -.L10: //ALPHA!=0|1 and INCX!=1 - bge $r0, I, .L32 - .align 3 -.L11: - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - vfmul.s VT0, VX0, VALPHA - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vfmul.s VT1, VX1, VALPHA - vstelm.w VT1, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT1, XX, 0, 3 - add.d XX, XX, INCX - addi.d I, I, -1 - blt $r0, I, .L11 - b .L32 - .align 3 - -.L20: - srai.d I, N, 3 - beq INCX, TEMP, .L24 - bge $r0, I, .L22 - .align 3 - -.L21: - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - fst.s a1, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - blt $r0, I, .L21 - .align 3 - -.L22: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L23: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - add.d X, X, INCX - blt $r0, I, .L23 - jirl $r0, $r1, 0 - .align 3 - -.L24: - bge $r0, I, .L26 /*N<8 INCX==1*/ - .align 3 -.L25: - vxor.v VX0, VX0, VX0 - vst VX0, X, 0 * SIZE - vst VX0, X, 4 * SIZE - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L25 - .align 3 - -.L26: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L27: - fst.s a1, X, 0 * SIZE - addi.d I, I, -1 - addi.d X, X, SIZE - blt $r0, I, .L27 - jirl $r0, $r1, 0 - .align 3 - -.L30: - bge $r0, I, .L32/*N<8 INCX==1*/ - movfr2gr.s TEMP, ALPHA - vreplgr2vr.w VALPHA , TEMP - .align 3 - -.L31: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VT0, VX0, VALPHA - vfmul.s VT1, VX1, VALPHA - addi.d I, I, -1 - vst VT0, X, 0 * SIZE - vst VT1, X, 4 * SIZE - addi.d X, X, 8 * SIZE - blt $r0, I, .L31 - .align 3 - -.L32: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 -.L33: - fld.s a1, X, 0 * SIZE - addi.d I, I, -1 - fmul.s a1, ALPHA, a1 - fst.s a1, X, 0 * SIZE - add.d X, X, INCX - blt $r0, I, .L33 - jirl $r0, $r1, 0 - .align 3 - -.L999: - jirl $r0, $r1, 0x0 - - EPILOGUE From 65a0aeb128a46b75b189d3a21b619d005351b2d2 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 17:45:17 +0800 Subject: [PATCH 525/718] loongarch64: Add c/zcopy optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/ccopy_lasx.S | 386 +++++++++++++++++++++ kernel/loongarch64/ccopy_lsx.S | 411 +++++++++++++++++++++++ 4 files changed, 801 insertions(+) create mode 100644 kernel/loongarch64/ccopy_lasx.S create mode 100644 kernel/loongarch64/ccopy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 7abdae55a..2aa68af67 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S +CCOPYKERNEL = ccopy_lsx.S +ZCOPYKERNEL = ccopy_lsx.S SSWAPKERNEL = swap_lsx.S DSWAPKERNEL = swap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 13f9f23ed..3bcec2c62 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S +CCOPYKERNEL = ccopy_lasx.S +ZCOPYKERNEL = ccopy_lasx.S SSWAPKERNEL = swap_lasx.S DSWAPKERNEL = swap_lasx.S diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S new file mode 100644 index 000000000..fbc5d96bc --- /dev/null +++ b/kernel/loongarch64/ccopy_lasx.S @@ -0,0 +1,386 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: + bge $r0, I, .L122 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX0, Y, 0 * SIZE, 2 + xvstelm.d VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 0 + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX1, Y, 0 * SIZE, 2 + xvstelm.d VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 0 + xvstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX2, Y, 0 * SIZE, 2 + xvstelm.d VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 0 + xvstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.d VX3, Y, 0 * SIZE, 2 + xvstelm.d VX3, Y, 1 * SIZE, 3 +#else + xvld VX1, X, 8 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 2 + xvstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 4 + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX0, Y, 0 * SIZE, 6 + xvstelm.w VX0, Y, 1 * SIZE, 7 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 0 + xvstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 2 + xvstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 4 + xvstelm.w VX1, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + xvstelm.w VX1, Y, 0 * SIZE, 6 + xvstelm.w VX1, Y, 1 * SIZE, 7 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE + xvst VX2, Y, 8 * SIZE + xvst VX3, Y, 12 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 8 * SIZE +#endif + addi.d I, I, -1 + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S new file mode 100644 index 000000000..4c4d880f1 --- /dev/null +++ b/kernel/loongarch64/ccopy_lsx.S @@ -0,0 +1,411 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11:// INCX==1 and INCY==1 + bge $r0, I, .L112 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + addi.d I, I, -1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + vst VX2, Y, 12 * SIZE + vst VX3, Y, 14 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + addi.d I, I, -1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + vstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX1, Y, 0 * SIZE, 0 + vstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX2, Y, 0 * SIZE, 0 + vstelm.d VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.d VX3, Y, 0 * SIZE, 0 + vstelm.d VX3, Y, 1 * SIZE, 1 +#else + vld VX1, X, 4 * SIZE + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX0, Y, 0 * SIZE, 2 + vstelm.w VX0, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 0 + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX1, Y, 0 * SIZE, 2 + vstelm.w VX1, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 0 + vstelm.w VX2, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX2, Y, 0 * SIZE, 2 + vstelm.w VX2, Y, 1 * SIZE, 3 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 0 + vstelm.w VX3, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + vstelm.w VX3, Y, 0 * SIZE, 2 + vstelm.w VX3, Y, 1 * SIZE, 3 +#endif + add.d Y, Y, INCY + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 8 * SIZE + vst VX1, Y, 10 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vst VX0, Y, 12 * SIZE + vst VX1, Y, 14 * SIZE +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE + vst VX2, Y, 8 * SIZE + vst VX3, Y, 12 * SIZE +#endif + addi.d Y, Y, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From d97272cb35af5849515b6a5850ddfe642e29430f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 19:09:18 +0800 Subject: [PATCH 526/718] loongarch64: Add c/zdot optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cdot_lasx.S | 565 +++++++++++++++++++++++ kernel/loongarch64/cdot_lsx.S | 397 ++++++++++++++++ 4 files changed, 966 insertions(+) create mode 100644 kernel/loongarch64/cdot_lasx.S create mode 100644 kernel/loongarch64/cdot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 2aa68af67..c70120f9a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -3,6 +3,8 @@ ifndef NO_LSX SDOTKERNEL = dot_lsx.S DSDOTKERNEL = dot_lsx.S DDOTKERNEL = dot_lsx.S +CDOTKERNEL = cdot_lsx.S +ZDOTKERNEL = cdot_lsx.S SSCALKERNEL = scal_lsx.S DSCALKERNEL = scal_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3bcec2c62..98673ae09 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -3,6 +3,8 @@ ifndef NO_LASX SDOTKERNEL = dot_lasx.S DSDOTKERNEL = dot_lasx.S DDOTKERNEL = dot_lasx.S +CDOTKERNEL = cdot_lasx.S +ZDOTKERNEL = cdot_lasx.S SSCALKERNEL = scal_lasx.S DSCALKERNEL = scal_lasx.S diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S new file mode 100644 index 000000000..0583e56ea --- /dev/null +++ b/kernel/loongarch64/cdot_lasx.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $xr16 +#define res2 $xr17 +#define res3 $xr18 +#define res4 $xr19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define x1 $xr20 +#define x2 $xr21 +#define x3 $xr22 +#define x4 $xr23 + + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v res3, res3, res3 + xvxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + addi.d X, X, 8 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + xvld VX1, X, 8 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + addi.d X, X, 16 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + addi.d Y, Y, 8 * SIZE + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvld VX3, Y, 8 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + addi.d Y, Y, 8 * SIZE + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + xvfmadd.d res1, x1, x3, res1 + xvfmadd.d res2, x2, x3, res2 + xvfmadd.d res3, x1, x4, res3 + xvfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + xvfmadd.s res1, x1, x3, res1 + xvfmadd.s res2, x2, x3, res2 + xvfmadd.s res3, x1, x4, res3 + xvfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + xvpickve.d VX1, res2, 1 + xvpickve.d VX2, res2, 2 + xvpickve.d VX3, res2, 3 + xvfadd.d res2, VX1, res2 + xvfadd.d res2, VX2, res2 + xvfadd.d res2, VX3, res2 + xvpickve.d VX1, res3, 1 + xvpickve.d VX2, res3, 2 + xvpickve.d VX3, res3, 3 + xvfadd.d res3, VX1, res3 + xvfadd.d res3, VX2, res3 + xvfadd.d res3, VX3, res3 + xvpickve.d VX1, res4, 1 + xvpickve.d VX2, res4, 2 + xvpickve.d VX3, res4, 3 + xvfadd.d res4, VX1, res4 + xvfadd.d res4, VX2, res4 + xvfadd.d res4, VX3, res4 +#else + xvpickve.w VX0, res1, 1 + xvpickve.w VX1, res1, 2 + xvpickve.w VX2, res1, 3 + xvpickve.w VX3, res1, 4 + xvpickve.w x1, res1, 5 + xvpickve.w x2, res1, 6 + xvpickve.w x3, res1, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, x1, res1 + xvfadd.s res1, x2, res1 + xvfadd.s res1, x3, res1 + xvpickve.w VX0, res2, 1 + xvpickve.w VX1, res2, 2 + xvpickve.w VX2, res2, 3 + xvpickve.w VX3, res2, 4 + xvpickve.w x1, res2, 5 + xvpickve.w x2, res2, 6 + xvpickve.w x3, res2, 7 + xvfadd.s res2, VX0, res2 + xvfadd.s res2, VX1, res2 + xvfadd.s res2, VX2, res2 + xvfadd.s res2, VX3, res2 + xvfadd.s res2, x1, res2 + xvfadd.s res2, x2, res2 + xvfadd.s res2, x3, res2 + xvpickve.w VX0, res3, 1 + xvpickve.w VX1, res3, 2 + xvpickve.w VX2, res3, 3 + xvpickve.w VX3, res3, 4 + xvpickve.w x1, res3, 5 + xvpickve.w x2, res3, 6 + xvpickve.w x3, res3, 7 + xvfadd.s res3, VX0, res3 + xvfadd.s res3, VX1, res3 + xvfadd.s res3, VX2, res3 + xvfadd.s res3, VX3, res3 + xvfadd.s res3, x1, res3 + xvfadd.s res3, x2, res3 + xvfadd.s res3, x3, res3 + xvpickve.w VX0, res4, 1 + xvpickve.w VX1, res4, 2 + xvpickve.w VX2, res4, 3 + xvpickve.w VX3, res4, 4 + xvpickve.w x1, res4, 5 + xvpickve.w x2, res4, 6 + xvpickve.w x3, res4, 7 + xvfadd.s res4, VX0, res4 + xvfadd.s res4, VX1, res4 + xvfadd.s res4, VX2, res4 + xvfadd.s res4, VX3, res4 + xvfadd.s res4, x1, res4 + xvfadd.s res4, x2, res4 + xvfadd.s res4, x3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S new file mode 100644 index 000000000..5feea12be --- /dev/null +++ b/kernel/loongarch64/cdot_lsx.S @@ -0,0 +1,397 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r19 +#define TEMP $r10 +#define t1 $r11 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 +#define res4 $vr19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define x1 $vr20 +#define x2 $vr21 +#define x3 $vr22 +#define x4 $vr23 + + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v res3, res3, res3 + vxor.v res4, res4, res4 + bge $r0, N, .L999 + li.d TEMP, 2 * SIZE + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 1 +#else + srai.d I, N, 2 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + .align 3 + +.L111: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L996 + .align 3 + +.L12: + bge $r0, I, .L997 + .align 3 + +.L121: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + addi.d X, X, 4 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + addi.d X, X, 8 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L21: + bge $r0, I, .L997 + .align 3 + +.L211: + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + addi.d Y, Y, 4 * SIZE + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + addi.d Y, Y, 8 * SIZE + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L211 + b .L996 + .align 3 + +.L22: + bge $r0, I, .L997 + .align 3 + +.L222: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + vfmadd.d res1, x1, x3, res1 + vfmadd.d res2, x2, x3, res2 + vfmadd.d res3, x1, x4, res3 + vfmadd.d res4, x2, x4, res4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + vfmadd.s res1, x1, x3, res1 + vfmadd.s res2, x2, x3, res2 + vfmadd.s res3, x1, x4, res3 + vfmadd.s res4, x2, x4, res4 +#endif + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L996: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + vreplvei.d VX1, res2, 1 + vfadd.d res2, VX1, res2 + vreplvei.d VX1, res3, 1 + vfadd.d res3, VX1, res3 + vreplvei.d VX1, res4, 1 + vfadd.d res4, VX1, res4 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 + vreplvei.w VX1, res2, 1 + vreplvei.w VX2, res2, 2 + vreplvei.w VX3, res2, 3 + vfadd.s res2, VX1, res2 + vfadd.s res2, VX2, res2 + vfadd.s res2, VX3, res2 + vreplvei.w VX1, res3, 1 + vreplvei.w VX2, res3, 2 + vreplvei.w VX3, res3, 3 + vfadd.s res3, VX1, res3 + vfadd.s res3, VX2, res3 + vfadd.s res3, VX3, res3 + vreplvei.w VX1, res4, 1 + vreplvei.w VX2, res4, 2 + vreplvei.w VX3, res4, 3 + vfadd.s res4, VX1, res4 + vfadd.s res4, VX2, res4 + vfadd.s res4, VX3, res4 +#endif + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MADD s1, a1, a3, s1 + MADD s2, a2, a3, s2 + MADD s3, a1, a4, s3 + MADD s4, a2, a4, s4 + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 + ADD $f1, s3, s2 +#else + ADD $f0, s1, s4 + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From fbd612f8c4f3eceb16a645c8a5366af35e7b6a2e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:07:58 +0800 Subject: [PATCH 527/718] loongarch64: Add ic/zamin optimization functions. --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/icamin_lasx.S | 555 +++++++++++++++++++++++ kernel/loongarch64/icamin_lsx.S | 425 +++++++++++++++++ 5 files changed, 988 insertions(+) create mode 100644 kernel/loongarch64/icamin_lasx.S create mode 100644 kernel/loongarch64/icamin_lsx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index 599b4795c..e581e2e3e 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -133,6 +133,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.d #define XVFADD xvfadd.d +#define XVFMUL xvfmul.d #define XVFMADD xvfmadd.d #define XVFMIN xvfmin.d #define XVFMINA xvfmina.d @@ -146,6 +147,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.d #define VFADD vfadd.d +#define VFMUL vfmul.d #define VFMADD vfmadd.d #define VFMIN vfmin.d #define VFMINA vfmina.d @@ -185,6 +187,7 @@ static inline int WhereAmI(void){ #define XVFSUB xvfsub.s #define XVFADD xvfadd.s +#define XVFMUL xvfmul.s #define XVFMADD xvfmadd.s #define XVFMIN xvfmin.s #define XVFMINA xvfmina.s @@ -198,6 +201,7 @@ static inline int WhereAmI(void){ #define VFSUB vfsub.s #define VFADD vfadd.s +#define VFMUL vfmul.s #define VFMADD vfmadd.s #define VFMIN vfmin.s #define VFMINA vfmina.s diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c70120f9a..5e2632574 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lsx.S ISAMINKERNEL = iamin_lsx.S IDAMINKERNEL = iamin_lsx.S +ICAMINKERNEL = icamin_lsx.S +IZAMINKERNEL = icamin_lsx.S SCOPYKERNEL = copy_lsx.S DCOPYKERNEL = copy_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 98673ae09..20a4d9a7e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lasx.S ISAMINKERNEL = iamin_lasx.S IDAMINKERNEL = iamin_lasx.S +ICAMINKERNEL = icamin_lasx.S +IZAMINKERNEL = icamin_lasx.S SCOPYKERNEL = copy_lasx.S DCOPYKERNEL = copy_lasx.S diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S new file mode 100644 index 000000000..01abd45b2 --- /dev/null +++ b/kernel/loongarch64/icamin_lasx.S @@ -0,0 +1,555 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $xr9 +#define x2 $xr10 +#define x3 $xr11 +#define x4 $xr12 +#define VX0 $xr13 +#define VX1 $xr14 +#define VM0 $xr15 +#define VM1 $xr16 +#define VINC4 $xr17 +#define VINC8 $xr18 +#define VI0 $xr20 +#define VI1 $xr21 +#define VI2 $xr22 +#define VI3 $xr8 +#define VI4 $xr19 +#define VT0 $xr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.d I, -1 + xvreplgr2vr.d VI4, I + xvffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + xvreplve0.w VM0, VM0 + xvxor.v VI3, VI3, VI3 // 0 + li.w I, -1 + xvreplgr2vr.w VI4, I + xvffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE +#ifdef DOUBLE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE + addi.d I, I, -1 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 +#else + xvadd.w VI1, VI1, VINC8 + xvld VX1, X, 8 * SIZE + addi.d I, I, -1 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvfmul.s x3, VI4, x1 + xvfmul.s x4, VI4, x2 + xvfcmp.clt.s VT0, x1, VI3 + xvfcmp.clt.s VINC4, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC4 +#endif + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmin.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmin.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmin.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 +#endif + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + b .L26 + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 2 //4 + xvreplgr2vr.d VINC4, i0 + addi.d i0, i0, -7 + xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, -1 + xvinsgr2vr.d VI1, i0, 2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI1, i0, 3 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 1 //3 + addi.d i0, i0, -1 + xvinsgr2vr.d VI0, i0, 2 //2 + addi.d i0, i0, 2 + xvinsgr2vr.d VI0, i0, 3 //4 +#else + addi.w i0, i0, 1 + srai.d I, N, 3 + bge $r0, I, .L21 + slli.w i0, i0, 3 //8 + xvreplgr2vr.w VINC8, i0 + addi.w i0, i0, -15 + xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, -3 + xvinsgr2vr.w VI1, i0, 4 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 5 + addi.w i0, i0, 3 + xvinsgr2vr.w VI1, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 2 //5 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 3 //6 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 4 //3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 5 //4 + addi.w i0, i0, 3 + xvinsgr2vr.w VI0, i0, 6 //7 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 //8 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + xvadd.w VI1, VI1, VINC8 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + XVFMUL x3, VI4, x1 + XVFMUL x4, VI4, x2 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMIN x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + xvpickve.d VI1, VI0, 0 + xvpickve.d VI2, VI0, 1 + xvpickve.d VI3, VI0, 2 + xvpickve.d VI4, VI0, 3 + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + xvpickve.d x3, VM0, 2 + xvpickve.d x4, VM0, 3 + xvfmina.d VM1, x1, x2 + xvfcmp.ceq.d VT0, VM1, x1 + xvbitsel.v VINC4, VI2, VI1, VT0 + xvfmina.d VM0, x3, x4 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VINC8, VI4, VI3, VT0 + xvfmina.d VM0, VM0, VM1 + xvfcmp.ceq.d VT0, VM0, VM1 +#else + xvxor.v VX0, VX0, VX0 + xvor.v VX0, VI0, VX0 + xvxor.v VX1, VX1, VX1 + xvor.v VX1, VM0, VX1 + xvpickve.w VI1, VI0, 0 + xvpickve.w VI2, VI0, 1 + xvpickve.w VI3, VI0, 2 + xvpickve.w VI4, VI0, 3 + xvpickve.w x1, VM0, 0 + xvpickve.w x2, VM0, 1 + xvpickve.w x3, VM0, 2 + xvpickve.w x4, VM0, 3 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VM1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, VM1 + xvbitsel.v VM0, VM0, VM1, VT0 +#endif + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + XVCMPLT VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L26: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + XVCMPLT VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + XVCMPLT VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + XVCMPLT VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 + +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 +#else + fmov.s $f16, $f20 + .align 3 + +.L252: + xvxor.v VI0, VI0, VI0 + xvor.v VI0, VI0, VX0 + fmov.s $f13, $f15 + xvxor.v VM0, VM0, VM0 + xvor.v VM0, VM0, VX1 + xvpickve.w VI1, VI0, 4 + xvpickve.w VI2, VI0, 5 + xvpickve.w VI3, VI0, 6 + xvpickve.w VI4, VI0, 7 + xvpickve.w x1, VM0, 4 + xvpickve.w x2, VM0, 5 + xvpickve.w x3, VM0, 6 + xvpickve.w x4, VM0, 7 + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v x1, x1, x2, VT0 + xvbitsel.v VINC4, VI1, VI2, VT0 + xvfcmp.clt.s VT0, x3, x4 + xvbitsel.v VM0, x3, x4, VT0 + xvbitsel.v VINC8, VI3, VI4, VT0 + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L262 + xvfcmp.clt.s VT0, VI1, VI0 + xvbitsel.v VI0, VI0, VI1, VT0 + .align 3 + +.L262: + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L272 + xvfcmp.clt.s VT0, VI2, VI0 + xvbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L272: + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L282 + xvfcmp.clt.s VT0, VI3, VI0 + xvbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L282: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L292 + xvfcmp.clt.s VT0, VI4, VI0 + xvbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L292: + fcmp.clt.s $fcc0, $f15, $f13 + fsel $f15, $f15, $f13, $fcc0 + fsel $f20, $f20, $f16, $fcc0 + movfr2gr.s i0, $f20 + +.L21: //N<8 + andi I, N, 7 + bge $r0, I, .L999 + srai.d i1, N, 3 + slli.d i1, i1, 3 +#endif + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S new file mode 100644 index 000000000..a08cd33c5 --- /dev/null +++ b/kernel/loongarch64/icamin_lsx.S @@ -0,0 +1,425 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r12 +#define t1 $r13 +#define t2 $r15 +#define t3 $r18 +#define t4 $r16 +#define i0 $r17 +#define i1 $r14 +#define TEMP $r19 +#define a0 $f12 +#define a1 $f13 +#define s1 $f15 +#define x1 $vr9 +#define x2 $vr10 +#define x3 $vr11 +#define x4 $vr12 +#define VX0 $vr13 +#define VX1 $vr14 +#define VM0 $vr15 +#define VM1 $vr16 +#define VINC4 $vr17 +#define VINC8 $vr18 +#define VI0 $vr20 +#define VI1 $vr21 +#define VI2 $vr22 +#define VI3 $vr8 +#define VI4 $vr19 +#define VT0 $vr23 + + PROLOGUE + li.d i0, 0 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 + vreplvei.w VM0, VM0, 0 + vxor.v VI3, VI3, VI3 // 0 +#ifdef DOUBLE + li.d I, -1 + vreplgr2vr.d VI4, I + vffint.d.l VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + li.w I, -1 + vreplgr2vr.w VI4, I + vffint.s.w VI4, VI4 // -1 + bne INCX, TEMP, .L20 + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L10: + vld VX0, X, 0 * SIZE +#ifdef DOUBLE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 2 * SIZE + addi.d I, I, -1 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + vfcmp.ceq.d VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + vld VX0, X, 4 * SIZE + vadd.d VI1, VI1, VINC4 + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else + vadd.w VI1, VI1, VINC4 + vld VX1, X, 4 * SIZE + addi.d I, I, -1 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + addi.d X, X, 8 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L10 + .align 3 + +.L15: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 + b .L26 +#endif + .align 3 + +.L20: // INCX!=1 +#ifdef DOUBLE + addi.d i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.d i0, i0, 1 //2 + vreplgr2vr.d VINC4, i0 + addi.d i0, i0, -3 + vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization + addi.d i0, i0, 1 + vinsgr2vr.d VI1, i0, 1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 0 //1 + addi.d i0, i0, 1 + vinsgr2vr.d VI0, i0, 1 //2 +#else + addi.w i0, i0, 1 + srai.d I, N, 2 + bge $r0, I, .L21 + slli.w i0, i0, 2 //4 + vreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 + vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 1 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 2 + addi.w i0, i0, 1 + vinsgr2vr.w VI1, i0, 3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 0 //1 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 1 //2 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 2 //3 + addi.w i0, i0, 1 + vinsgr2vr.w VI0, i0, 3 //4 +#endif + .align 3 + +.L24: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 + vfmul.d x3, VI4, x1 + vfmul.d x4, VI4, x2 + vfcmp.clt.d VT0, x1, VI3 + vfcmp.clt.d VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + vfadd.d x1, x1, x2 + vfmin.d x3, VM0, x1 + ld.d t1, X, 0 * SIZE + vfcmp.ceq.d VT0, x3, VM0 + ld.d t2, X, 1 * SIZE + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vadd.d VI1, VI1, VINC4 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vadd.w VI1, VI1, VINC4 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 +#endif + addi.d I, I, -1 + VFMUL x3, VI4, x1 + VFMUL x4, VI4, x2 + VCMPLT VT0, x1, VI3 + VCMPLT VINC8, x2, VI3 + vbitsel.v x1, x1, x3, VT0 + vbitsel.v x2, x2, x4, VINC8 + VFADD x1, x1, x2 + VFMIN x3, VM0, x1 + VCMPEQ VT0, x3, VM0 + vbitsel.v VM0, x3, VM0, VT0 + vbitsel.v VI0, VI1, VI0, VT0 + blt $r0, I, .L24 + .align 3 + +.L25: +#ifdef DOUBLE + vreplvei.d VI1, VI0, 0 + vreplvei.d VI2, VI0, 1 + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.d VT0, VI1, VI2 + vbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w VI1, VI0, 0 + vreplvei.w VI2, VI0, 1 + vreplvei.w VI3, VI0, 2 + vreplvei.w VI4, VI0, 3 + vreplvei.w x1, VM0, 0 + vreplvei.w x2, VM0, 1 + vreplvei.w x3, VM0, 2 + vreplvei.w x4, VM0, 3 + vfmina.s VM1, x1, x2 + vfcmp.ceq.s VT0, VM1, x1 + vbitsel.v VINC4, VI2, VI1, VT0 + vfmina.s VM0, x3, x4 + vfcmp.ceq.s VT0, x3, VM0 + vbitsel.v VINC8, VI4, VI3, VT0 + vfmina.s VM0, VM0, VM1 + vfcmp.ceq.s VT0, VM0, VM1 + vbitsel.v VI0, VINC8, VINC4, VT0 + fcmp.ceq.d $fcc0, $f15, $f9 + bceqz $fcc0, .L26 + vfcmp.clt.s VT0, VI1, VI0 + vbitsel.v VI0, VI0, VI1, VT0 +#endif + .align 3 + +.L26: +#ifdef DOUBLE + vfmina.d VM0, x1, x2 + vfcmp.ceq.d VT0, x1, VM0 +#else + fcmp.ceq.d $fcc0, $f15, $f10 + bceqz $fcc0, .L27 + vfcmp.clt.s VT0, VI2, VI0 +#endif + vbitsel.v VI0, VI0, VI2, VT0 + .align 3 + +.L27: +#ifdef DOUBLE + movfr2gr.d i0, $f20 + .align 3 +#else + fcmp.ceq.d $fcc0, $f15, $f11 + bceqz $fcc0, .L28 + vfcmp.clt.s VT0, VI3, VI0 + vbitsel.v VI0, VI0, VI3, VT0 + .align 3 + +.L28: + fcmp.ceq.d $fcc0, $f15, $f12 + bceqz $fcc0, .L29 + vfcmp.clt.s VT0, VI4, VI0 + vbitsel.v VI0, VI0, VI4, VT0 + .align 3 + +.L29: + movfr2gr.s i0, $f20 + .align 3 + +#endif +.L21: //N<4 + andi I, N, 3 + bge $r0, I, .L999 + srai.d i1, N, 2 + slli.d i1, i1, 2 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L22: + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + addi.d I, I, -1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 + FMIN a1, s1, a0 + CMPEQ $fcc0, s1, a1 + add.d X, X, INCX + fsel s1, a1, s1, $fcc0 + fsel $f20, $f21, $f20, $fcc0 + addi.d i1, i1, 1 + movgr2fr.d $f21, i1 + blt $r0, I, .L22 + MTG i0, $f20 + .align 3 + +.L999: + move $r4, $r17 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From 3c53ded315901759f0ee2a77d07121c8905fb18d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 20:26:01 +0800 Subject: [PATCH 528/718] loongarch64: Add c/znrm2 optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/cnrm2_lasx.S | 147 +++++++++++++ kernel/loongarch64/cnrm2_lsx.S | 155 ++++++++++++++ kernel/loongarch64/znrm2_lasx.S | 252 ++++++++++++++++++++++ kernel/loongarch64/znrm2_lsx.S | 260 +++++++++++++++++++++++ 6 files changed, 818 insertions(+) create mode 100644 kernel/loongarch64/cnrm2_lasx.S create mode 100644 kernel/loongarch64/cnrm2_lsx.S create mode 100644 kernel/loongarch64/znrm2_lasx.S create mode 100644 kernel/loongarch64/znrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5e2632574..826588318 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S +CNRM2KERNEL = cnrm2_lsx.S +ZNRM2KERNEL = znrm2_lsx.S DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 20a4d9a7e..b61ecd427 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -70,6 +70,8 @@ DROTKERNEL = rot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S +CNRM2KERNEL = cnrm2_lasx.S +ZNRM2KERNEL = znrm2_lasx.S DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S new file mode 100644 index 000000000..3a60069ac --- /dev/null +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -0,0 +1,147 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $xr15 +#define VX1 $xr16 +#define VX2 $xr17 +#define VX3 $xr18 +#define VX4 $xr21 +#define res1 $xr19 +#define res2 $xr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, SIZE + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + .align 3 + b .L996 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfcvtl.d.s VX1, VX0 + xvfcvth.d.s VX2, VX0 + xvfmadd.d res1, VX1, VX1, res1 + xvfmadd.d res2, VX2, VX2, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, res + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S new file mode 100644 index 000000000..20950ba17 --- /dev/null +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -0,0 +1,155 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define a1 $f15 +#define a2 $f16 +#define res $f19 +#define VX0 $vr15 +#define VX1 $vr16 +#define VX2 $vr17 +#define VX3 $vr18 +#define VX4 $vr21 +#define res1 $vr19 +#define res2 $vr20 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L997 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + vld VX0, X, 4 * SIZE + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L996 + .align 3 + +.L20: + bge $r0, I, .L997 + .align 3 + +.L21: + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX1, VX0 + vfcvth.d.s VX2, VX0 + vfmadd.d res1, VX1, VX1, res1 + vfmadd.d res2, VX2, VX2, res2 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfcvtl.d.s VX3, VX0 + vfcvth.d.s VX4, VX0 + vfmadd.d res1, VX3, VX3, res1 + vfmadd.d res2, VX4, VX4, res2 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s a1, a1 + fcvt.d.s a2, a2 + fmadd.d res, a1, a1, res + fmadd.d res, a2, a2, res + add.d X, X, INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d res, res + move $r4, $r17 + fcvt.s.d $f0, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S new file mode 100644 index 000000000..53f8a6e05 --- /dev/null +++ b/kernel/loongarch64/znrm2_lasx.S @@ -0,0 +1,252 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $xr15 +#define VX1 $xr16 +#define VM0 $xr17 +#define VM1 $xr18 +#define VM2 $xr13 +#define VM3 $xr14 +#define res1 $xr19 +#define res2 $xr20 +#define VALPHA $xr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + xvxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfmaxa.d VM1, VX1, VX0 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM0, VM0, VM1 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + xvpickve.d VX0, VM0, 1 + xvpickve.d VX1, VM0, 2 + xvpickve.d VM3, VM0, 3 + xvfmaxa.d VM1, VX0, VX1 + xvfmaxa.d VM2, VM3, VM0 + xvfmaxa.d VM0, VM1, VM2 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + xvreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + xvld VX0, XX, 0 * SIZE + xvld VX1, XX, 4 * SIZE + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfmul.d VM2, VX0, VALPHA + xvfmul.d VM3, VX1, VALPHA + xvfmadd.d res1, VM2, VM2, res1 + xvfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + xvfadd.d res1, res1, res2 + xvpickve.d VX0, res1, 1 + xvpickve.d VX1, res1, 2 + xvpickve.d VM2, res1, 3 + xvfadd.d res1, VX0, res1 + xvfadd.d VX1, VX1, VM2 + xvfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S new file mode 100644 index 000000000..14c59d504 --- /dev/null +++ b/kernel/loongarch64/znrm2_lsx.S @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r19 +#define I $r17 +#define TEMP $r18 +#define t1 $r12 +#define t2 $r13 +#define t3 $r14 +#define t4 $r15 +#define INF $f23 +#define a1 $f22 +#define max $f17 +#define ALPHA $f12 +#define a3 $f15 +#define a2 $f16 +#define VX0 $vr15 +#define VX1 $vr16 +#define VM0 $vr17 +#define VM1 $vr18 +#define VM2 $vr13 +#define VM3 $vr14 +#define res1 $vr19 +#define res2 $vr20 +#define VALPHA $vr21 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + vxor.v VM0, VM0, VM0 + bge $r0, N, .L999 + beq $r0, INCX, .L999 + move XX, X + // Init INF + addi.d TEMP, $r0, 0x7FF + slli.d TEMP, TEMP, 52 + MTC INF, TEMP + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L97 + .align 3 + +.L10: + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfmaxa.d VM1, VX1, VX0 + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vfmaxa.d VM2, VX1, VX0 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L10 + b .L96 + .align 3 + +.L20: // INCX!=1 + bge $r0, I, .L97 + .align 3 + +.L21: + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfmaxa.d VM1, VX0, VX1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmaxa.d VM2, VX0, VX1 + vfmaxa.d VM3, VM1, VM2 + vfmaxa.d VM0, VM0, VM3 + addi.d I, I, -1 + blt $r0, I, .L21 + b .L96 + .align 3 + +.L96: + vreplvei.d VX0, VM0, 0 + vreplvei.d VX1, VM0, 1 + vfmaxa.d VM0, VX0, VX1 + .align 3 + +.L97: + andi I, N, 3 + bge $r0, I, .L99 + .align 3 + +.L98: + fld.d a3, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fmaxa.d a3, a2, a3 + fmaxa.d max, a3, max + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L98 + .align 3 + +.L99: + fabs.d max, max + lu12i.w TEMP, 0x3f800 // 1 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, max, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, max + CMPEQ $fcc0, INF, ALPHA + bcnez $fcc0, .L999 + movfr2gr.d TEMP, ALPHA + vreplgr2vr.d VALPHA, TEMP + +.L100: + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L120 + bge $r0, I, .L997 + .align 3 + +.L110: + vld VX0, XX, 0 * SIZE + vld VX1, XX, 2 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + vld VX0, XX, 4 * SIZE + vld VX1, XX, 6 * SIZE + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d XX, XX, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L110 + b .L996 + .align 3 + +.L120: + bge $r0, I, .L997 + .align 3 + +.L121: + ld.d t1, XX, 0 * SIZE + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + ld.d t1, XX, 0 * SIZE + vfmul.d VM3, VX1, VALPHA + ld.d t2, XX, 1 * SIZE + add.d XX, XX, INCX + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + ld.d t3, XX, 0 * SIZE + ld.d t4, XX, 1 * SIZE + add.d XX, XX, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + vfmul.d VM2, VX0, VALPHA + vfmul.d VM3, VX1, VALPHA + vfmadd.d res1, VM2, VM2, res1 + vfmadd.d res2, VM3, VM3, res2 + addi.d I, I, -1 + blt $r0, I, .L121 + b .L996 + .align 3 + +.L996: + vfadd.d res1, res1, res2 + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 + .align 3 + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: + fld.d a3, XX, 0 * SIZE + fld.d a2, XX, 1 * SIZE + addi.d I, I, -1 + fmul.d a3, a3, ALPHA + fmadd.d $f19, a3, a3, $f19 + fmul.d a2, a2, ALPHA + fmadd.d $f19, a2, a2, $f19 + add.d XX, XX , INCX + blt $r0, I, .L998 + .align 3 + +.L999: + fsqrt.d $f19, $f19 + fmul.d $f0, max, $f19 + jirl $r0, $r1, 0x0 + + EPILOGUE From 1ec5dded43b9d5e14875061f62d5edec480f9584 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 28 Dec 2023 21:23:59 +0800 Subject: [PATCH 529/718] loongarch64: Add c/zrot optimization functions. Signed-off-by: Hao Chen --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/crot_lasx.S | 1079 ++++++++++++++++++++++ kernel/loongarch64/crot_lsx.S | 907 ++++++++++++++++++ 4 files changed, 1990 insertions(+) create mode 100644 kernel/loongarch64/crot_lasx.S create mode 100644 kernel/loongarch64/crot_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 826588318..02ea4304e 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lsx.S SROTKERNEL = rot_lsx.S DROTKERNEL = rot_lsx.S +CROTKERNEL = crot_lsx.S +ZROTKERNEL = crot_lsx.S SNRM2KERNEL = snrm2_lsx.S DNRM2KERNEL = dnrm2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index b61ecd427..462698f85 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -67,6 +67,8 @@ ZASUMKERNEL = casum_lasx.S SROTKERNEL = rot_lasx.S DROTKERNEL = rot_lasx.S +CROTKERNEL = crot_lasx.S +ZROTKERNEL = crot_lasx.S SNRM2KERNEL = snrm2_lasx.S DNRM2KERNEL = dnrm2_lasx.S diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S new file mode 100644 index 000000000..d4ec1e22c --- /dev/null +++ b/kernel/loongarch64/crot_lasx.S @@ -0,0 +1,1079 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VT0 $xr10 +#define VT1 $xr18 +#define VXC $xr23 +#define VXS $xr9 +#define VXZ $xr11 +#define x1 $xr12 +#define x2 $xr13 +#define x3 $xr14 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + xvreplgr2vr.d VXC, t1 + xvreplgr2vr.d VXS, t2 + xvreplgr2vr.d VXZ, t3 + srai.d I, N, 2 +#else + xvreplgr2vr.w VXC, t1 + xvreplgr2vr.w VXS, t2 + xvreplgr2vr.w VXZ, t3 + srai.d I, N, 3 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE + xvld VX1, X, 4 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvilvl.d x1, VX2 ,VX0 + xvilvh.d x2, VX2, VX0 + xvilvl.d x3, VX3 ,VX1 + xvilvh.d x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 4 * SIZE + xvst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvld VX1, X, 8 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvilvl.w x1, VX2 ,VX0 + xvilvh.w x2, VX2, VX0 + xvilvl.w x3, VX3 ,VX1 + xvilvh.w x4, VX3, VX1 + xvst x1, X, 0 * SIZE + xvst x3, Y, 0 * SIZE + xvst x2, X, 8 * SIZE + xvst x4, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + xvst VXZ, X, 0 * SIZE + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, X, 4 * SIZE + xvst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, X, 8 * SIZE + xvst VXZ, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvfmul.d VX0, x1, VXC + xvfmadd.d VX0, x3, VXS, VX0 + xvfmul.d VX1, x1, VXS + xvfmsub.d VX1, x3, VXC, VX1 + xvfmul.d VX2, x2, VXC + xvfmadd.d VX2, x4, VXS, VX2 + xvfmul.d VX3, x2, VXS + xvfmsub.d VX3, x4, VXC, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvfmul.s VX0, x1, VXC + xvfmadd.s VX0, x3, VXS, VX0 + xvfmul.s VX1, x1, VXS + xvfmsub.s VX1, x3, VXC, VX1 + xvfmul.s VX2, x2, VXC + xvfmadd.s VX2, x4, VXS, VX2 + xvfmul.s VX3, x2, VXS + xvfmsub.s VX3, x4, VXC, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x1, VXC + xvfmul.d VX1, x3, VXC + xvfmul.d VX2, x2, VXC + xvfmul.d VX3, x4, VXC + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x1, VXC + xvfmul.s VX1, x3, VXC + xvfmul.s VX2, x2, VXC + xvfmul.s VX3, x4, VXC + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, x3, VXS + xvfmul.d VX1, x1, VXS + xvfsub.d VX1, VXZ, VX1 + xvfmul.d VX2, x4, VXS + xvfmul.d VX3, x2, VXS + xvfsub.d VX3, VXZ, VX3 + xvstelm.d VX0, XX, 0, 0 + xvstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 1 + xvstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 2 + xvstelm.d VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.d VX0, XX, 0, 3 + xvstelm.d VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.d VX1, YY, 0, 0 + xvstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 1 + xvstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 2 + xvstelm.d VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d VX1, YY, 0, 3 + xvstelm.d VX3, YY, 1 * SIZE, 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, x3, VXS + xvfmul.s VX1, x1, VXS + xvfsub.s VX1, VXZ, VX1 + xvfmul.s VX2, x4, VXS + xvfmul.s VX3, x2, VXS + xvfsub.s VX3, VXZ, VX3 + xvstelm.w VX0, XX, 0, 0 + xvstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 1 + xvstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 2 + xvstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 3 + xvstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 0 + xvstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 1 + xvstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 2 + xvstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 3 + xvstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w VX0, XX, 0, 4 + xvstelm.w VX2, XX, 1 * SIZE, 4 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 5 + xvstelm.w VX2, XX, 1 * SIZE, 5 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 6 + xvstelm.w VX2, XX, 1 * SIZE, 6 + add.d XX, XX, INCX + xvstelm.w VX0, XX, 0, 7 + xvstelm.w VX2, XX, 1 * SIZE, 7 + add.d XX, XX, INCX + xvstelm.w VX1, YY, 0, 4 + xvstelm.w VX3, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 5 + xvstelm.w VX3, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 6 + xvstelm.w VX3, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w VX1, YY, 0, 7 + xvstelm.w VX3, YY, 1 * SIZE, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, XX, 0, 0 + xvstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d VXZ, YY, 0, 0 + xvstelm.d VXZ, YY, 1 * SIZE, 0 +#else + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, XX, 0, 0 + xvstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 1 * SIZE, 0 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 + .align 3 + +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S new file mode 100644 index 000000000..126257edc --- /dev/null +++ b/kernel/loongarch64/crot_lsx.S @@ -0,0 +1,907 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define C $f0 +#define S $f1 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VT0 $vr10 +#define VT1 $vr18 +#define VXC $vr23 +#define VXS $vr9 +#define VXZ $vr11 +#define x1 $vr12 +#define x2 $vr13 +#define x3 $vr14 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + li.d TEMP, 1 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, C + MTG t2, S + MTG t3, a1 +#ifdef DOUBLE + vreplgr2vr.d VXC, t1 + vreplgr2vr.d VXS, t2 + vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXC, t1 + vreplgr2vr.w VXS, t2 + vreplgr2vr.w VXZ, t3 + srai.d I, N, 2 +#endif + beq INCX, $r0, .L996 + beq INCY, $r0, .L996 + bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L22 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L110 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L112 // C!=0 S==0 + b .L111 // C!=0 S!=0 + .align 3 + +.L110: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L114 // C==0 S==0 + b .L113 // C==0 S!=0 + .align 3 + +.L111: // C!=0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: // C!=0 S==0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: // C==0 S!=0 + vld VX0, X, 0 * SIZE + vld VX2, Y, 0 * SIZE +#ifdef DOUBLE + vld VX1, X, 2 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vilvl.d x1, VX2 ,VX0 + vilvh.d x2, VX2, VX0 + vilvl.d x3, VX3 ,VX1 + vilvh.d x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 2 * SIZE + vst x4, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vilvl.w x1, VX2 ,VX0 + vilvh.w x2, VX2, VX0 + vilvl.w x3, VX3 ,VX1 + vilvh.w x4, VX3, VX1 + vst x1, X, 0 * SIZE + vst x3, Y, 0 * SIZE + vst x2, X, 4 * SIZE + vst x4, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: // C==0 S==0 + vst VXZ, X, 0 * SIZE + vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + vst VXZ, X, 2 * SIZE + vst VXZ, Y, 2 * SIZE + addi.d X, X, 4 * SIZE + addi.d Y, Y, 4 * SIZE +#else + vst VXZ, X, 4 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L22: +#ifdef DOUBLE + srai.d I, N, 2 +#endif + bge $r0, I, .L997 + move YY, Y + move XX, X + CMPEQ $fcc0, C, a1 + bcnez $fcc0, .L220 + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L222 // C!=0 S==0 + b .L221 // C!=0 S!=0 + .align 3 + +.L220: + CMPEQ $fcc0, S, a1 + bcnez $fcc0, .L224 // C==0 S==0 + b .L223 // C==0 S!=0 + .align 3 + +.L221: // C!=0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmadd.d VX0, x3, VXS, VX0 + vfmul.d VX1, x1, VXS + vfmsub.d VX1, x3, VXC, VX1 + vfmul.d VX2, x2, VXC + vfmadd.d VX2, x4, VXS, VX2 + vfmul.d VX3, x2, VXS + vfmsub.d VX3, x4, VXC, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vfmul.s VX0, x1, VXC + vfmadd.s VX0, x3, VXS, VX0 + vfmul.s VX1, x1, VXS + vfmsub.s VX1, x3, VXC, VX1 + vfmul.s VX2, x2, VXC + vfmadd.s VX2, x4, VXS, VX2 + vfmul.s VX3, x2, VXS + vfmsub.s VX3, x4, VXC, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 +#endif + .align 3 + +.L222: // C!=0 S==0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x1, VXC + vfmul.d VX1, x3, VXC + vfmul.d VX2, x2, VXC + vfmul.d VX3, x4, VXC + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x1, VXC + vfmul.s VX1, x3, VXC + vfmul.s VX2, x2, VXC + vfmul.s VX3, x4, VXC + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L222 + b .L997 +#endif + .align 3 + +.L223: // C==0 S!=0 +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, x3, VXS + vfmul.d VX1, x1, VXS + vfsub.d VX1, VXZ, VX1 + vfmul.d VX2, x4, VXS + vfmul.d VX3, x2, VXS + vfsub.d VX3, VXZ, VX3 + vstelm.d VX0, XX, 0, 0 + vstelm.d VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VX0, XX, 0, 1 + vstelm.d VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.d VX1, YY, 0, 0 + vstelm.d VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VX1, YY, 0, 1 + vstelm.d VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L995 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, x3, VXS + vfmul.s VX1, x1, VXS + vfsub.s VX1, VXZ, VX1 + vfmul.s VX2, x4, VXS + vfmul.s VX3, x2, VXS + vfsub.s VX3, VXZ, VX3 + vstelm.w VX0, XX, 0, 0 + vstelm.w VX2, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 1 + vstelm.w VX2, XX, 1 * SIZE, 1 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 2 + vstelm.w VX2, XX, 1 * SIZE, 2 + add.d XX, XX, INCX + vstelm.w VX0, XX, 0, 3 + vstelm.w VX2, XX, 1 * SIZE, 3 + add.d XX, XX, INCX + vstelm.w VX1, YY, 0, 0 + vstelm.w VX3, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + vstelm.w VX3, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + vstelm.w VX3, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 + vstelm.w VX3, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L223 + b .L997 +#endif + .align 3 + +.L224: // C==0 S==0 +#ifdef DOUBLE + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, XX, 0, 0 + vstelm.d VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d VXZ, YY, 0, 0 + vstelm.d VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L995 +#else + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, XX, 0, 0 + vstelm.w VXZ, XX, 1 * SIZE, 0 + add.d XX, XX, INCX + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + vstelm.w VXZ, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + move X, XX + move Y, YY + b .L997 +#endif + .align 3 + +#ifdef DOUBLE + .L995: + andi I, N, 3 + bge $r0, I, .L999 + b .L998 + .align 3 + +#endif +.L996: + move I, N + b .L998 + .align 3 + +.L997: +#ifdef DOUBLE + andi I, N, 1 +#else + andi I, N, 3 +#endif + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + MUL s1, a1, C + MADD s1, a3, S, s1 + MUL s2, a1, S + MSUB s2, a3, C, s2 + MUL s3, a2, C + MADD s3, a4, S, s3 + MUL s4, a2, S + MSUB s4, a4, C, s4 + addi.d I, I, -1 + ST s1, X, 0 * SIZE + ST s2, Y, 0 * SIZE + ST s3, X, 1 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From edabb936681bff6fea0454aedf064c19c1db217f Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 29 Dec 2023 15:08:10 +0800 Subject: [PATCH 530/718] loongarch64: Refine axpby optimization functions. --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 4 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 4 +- .../{daxpby_lasx.S => axpby_lasx.S} | 475 ++++++++++++- .../loongarch64/{daxpby_lsx.S => axpby_lsx.S} | 503 +++++++++++++- kernel/loongarch64/saxpby_lasx.S | 597 ----------------- kernel/loongarch64/saxpby_lsx.S | 629 ------------------ 6 files changed, 931 insertions(+), 1281 deletions(-) rename kernel/loongarch64/{daxpby_lasx.S => axpby_lasx.S} (55%) rename kernel/loongarch64/{daxpby_lsx.S => axpby_lsx.S} (56%) delete mode 100644 kernel/loongarch64/saxpby_lasx.S delete mode 100644 kernel/loongarch64/saxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 02ea4304e..0fb0bb68f 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lsx.S CAXPYKERNEL = caxpy_lsx.S ZAXPYKERNEL = caxpy_lsx.S -SAXPBYKERNEL = saxpby_lsx.S -DAXPBYKERNEL = daxpby_lsx.S +SAXPBYKERNEL = axpby_lsx.S +DAXPBYKERNEL = axpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 462698f85..1a6a04532 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -54,8 +54,8 @@ DAXPYKERNEL = axpy_lasx.S CAXPYKERNEL = caxpy_lasx.S ZAXPYKERNEL = caxpy_lasx.S -SAXPBYKERNEL = saxpby_lasx.S -DAXPBYKERNEL = daxpby_lasx.S +SAXPBYKERNEL = axpby_lasx.S +DAXPBYKERNEL = axpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S similarity index 55% rename from kernel/loongarch64/daxpby_lasx.S rename to kernel/loongarch64/axpby_lasx.S index 4b19703e7..f1d99cd3b 100644 --- a/kernel/loongarch64/daxpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE xvreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA xvreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 xvreplgr2vr.d VXZ, t3 +#else + xvreplgr2vr.w VXA, t1 + xvreplgr2vr.w VXB, t2 + xvreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,21 +85,22 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 .L111: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX2, Y, 0 * SIZE xvld VX1, X, 4 * SIZE xvld VX3, Y, 4 * SIZE @@ -77,6 +111,13 @@ addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvfmadd.s VX2, VX2, VXB, VX0 + xvst VX2, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE blt $r0, I, .L111 @@ -85,34 +126,46 @@ .L112: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA xvst VX0, Y, 0 * SIZE xvst VX1, Y, 4 * SIZE +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L112 b .L997 .align 3 .L113: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d I, I, -1 + addi.d Y, Y, 8 * SIZE blt $r0, I, .L113 b .L997 .align 3 .L114: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -122,21 +175,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -182,14 +236,59 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L121 b .L997 .align 3 .L122: // ALPHA!=0 BETA==0 xvld VX0, X, 0 * SIZE +#ifdef DOUBLE xvld VX1, X, 4 * SIZE xvfmul.d VX0, VX0, VXA xvfmul.d VX1, VX1, VXA @@ -208,14 +307,33 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE - addi.d I, I, -1 blt $r0, I, .L122 b .L997 .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -250,7 +368,6 @@ xvstelm.d VX2, YY, 0, 3 add.d YY, YY, INCY xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 1 @@ -258,12 +375,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif + add.d YY, YY, INCY + addi.d I, I, -1 blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -279,6 +440,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -287,21 +465,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -334,12 +513,43 @@ xvfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 xvst VX3, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -369,6 +579,35 @@ xvfmul.d VX1, VX1, VXA addi.d I, I, -1 xvst VX1, Y, 4 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VXA, VX0 + addi.d I, I, -1 + xvst VX0, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -376,20 +615,27 @@ .L213: // ALPHA==0 BETA!=0 xvld VX2, Y, 0 * SIZE +#ifdef DOUBLE xvld VX3, Y, 4 * SIZE xvfmul.d VX2, VX2, VXB xvfmul.d VX3, VX3, VXB - addi.d I, I, -1 xvst VX2, Y, 0 * SIZE xvst VX3, Y, 4 * SIZE +#else + xvfmul.s VX2, VX2, VXB + xvst VX2, Y, 0 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE xvst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -399,20 +645,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -481,12 +728,81 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX0, VX0, VXA + xvfmadd.s VX2, VX2, VXB, VX0 + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -529,12 +845,56 @@ xvstelm.d VX1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX1, YY, 0, 3 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvfmul.s VX0, VX0, VXA + addi.d I, I, -1 + xvstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX0, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -577,12 +937,56 @@ xvstelm.d VX3, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VX3, YY, 0, 3 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvfmul.s VX2, VX2, VXB + addi.d I, I, -1 + xvstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VX2, YY, 0, 7 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE xvstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 1 @@ -598,6 +1002,23 @@ xvstelm.d VXZ, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VXZ, YY, 0, 3 +#else + xvstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 4 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 5 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 6 + add.d YY, YY, INCY + xvstelm.w VXZ, YY, 0, 7 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -610,12 +1031,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S similarity index 56% rename from kernel/loongarch64/daxpby_lsx.S rename to kernel/loongarch64/axpby_lsx.S index 9aafbaf2a..45154c262 100644 --- a/kernel/loongarch64/daxpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -1,6 +1,33 @@ -#define ASSEMBLER +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER #include "common.h" + #define N $r4 #define ALPHA $f0 #define X $r5 @@ -32,16 +59,22 @@ bge $r0, N, .L999 li.d TEMP, 1 movgr2fr.d a1, $r0 - ffint.d.l a1, a1 + ffint.s.l a1, a1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT - movfr2gr.d t1, ALPHA + MTG t1, ALPHA + MTG t2, BETA + MTG t3, a1 +#ifdef DOUBLE vreplgr2vr.d VXA, t1 - movfr2gr.d t2, BETA vreplgr2vr.d VXB, t2 - movfr2gr.d t3, a1 vreplgr2vr.d VXZ, t3 +#else + vreplgr2vr.w VXA, t1 + vreplgr2vr.w VXB, t2 + vreplgr2vr.w VXZ, t3 +#endif srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 @@ -52,15 +85,15 @@ .L11: bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L110 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 b .L111 // ALPHA!=0 BETA!=0 .align 3 .L110: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L114 // ALPHA==0 BETA==0 b .L113 // ALPHA==0 BETA!=0 .align 3 @@ -68,6 +101,7 @@ .L111: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vld VX3, Y, 2 * SIZE vfmul.d VX0, VX0, VXA @@ -86,6 +120,16 @@ vfmadd.d VX3, VX3, VXB, VX1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vfmadd.s VX2, VX2, VXB, VX0 + vfmadd.s VX3, VX3, VXB, VX1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -95,6 +139,7 @@ .L112: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -106,6 +151,13 @@ vfmul.d VX3, VX3, VXA vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif addi.d X, X, 8 * SIZE addi.d Y, Y, 8 * SIZE addi.d I, I, -1 @@ -113,7 +165,8 @@ b .L997 .align 3 -.L113: // ALPHA==0 BETA!=0\ +.L113: // ALPHA==0 BETA!=0 +#ifdef DOUBLE vld VX0, Y, 0 * SIZE vld VX1, Y, 2 * SIZE vfmul.d VX0, VX0, VXB @@ -126,6 +179,14 @@ vfmul.d VX3, VX3, VXB vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L113 @@ -134,9 +195,13 @@ .L114: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L114 @@ -146,21 +211,22 @@ .L12: // INCX==1 and INCY!=1 bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L120 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 b .L121 // ALPHA!=0 BETA!=0 .align 3 .L120: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L124 // ALPHA==0 BETA==0 b .L123 // ALPHA==0 BETA!=0 .align 3 .L121: // ALPHA!=0 BETA!=0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -212,6 +278,53 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + vld VX1, X, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE blt $r0, I, .L121 @@ -220,6 +333,7 @@ .L122: // ALPHA!=0 BETA==0 vld VX0, X, 0 * SIZE +#ifdef DOUBLE vld VX1, X, 2 * SIZE vfmul.d VX0, VX0, VXA vfmul.d VX1, VX1, VXA @@ -242,6 +356,26 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VXA + vfmul.s VX1, VX1, VXA + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d X, X, 8 * SIZE addi.d I, I, -1 @@ -250,6 +384,7 @@ .align 3 .L123: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -294,12 +429,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L123 b .L997 .align 3 .L124: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -315,6 +495,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L124 @@ -323,21 +520,22 @@ .L21:// INCX!=1 and INCY==1 bge $r0, I, .L997 - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L210 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 b .L211 // ALPHA!=0 BETA!=0 .align 3 .L210: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L214 // ALPHA==0 BETA==0 b .L213 // ALPHA==0 BETA!=0 .align 3 .L211: // ALPHA!=0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -378,12 +576,47 @@ vfmadd.d VX3, VX3, VXB, VX1 addi.d I, I, -1 vst VX3, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + vld VX3, Y, 4 * SIZE + vfmadd.s VX2, VX2, VXB, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX2, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + vfmadd.s VX3, VX3, VXB, VX1 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L211 b .L997 .align 3 .L212: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -417,6 +650,37 @@ vfmul.d VX1, VX1, VXA addi.d I, I, -1 vst VX1, Y, 6 * SIZE +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VXA, VX0 + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vst VX0, Y, 0 * SIZE + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vst VX1, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE blt $r0, I, .L212 b .L997 @@ -424,6 +688,7 @@ .L213: // ALPHA==0 BETA!=0 vld VX2, Y, 0 * SIZE +#ifdef DOUBLE vld VX3, Y, 2 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB @@ -433,19 +698,30 @@ vld VX3, Y, 6 * SIZE vfmul.d VX2, VX2, VXB vfmul.d VX3, VX3, VXB - addi.d I, I, -1 vst VX2, Y, 4 * SIZE vst VX3, Y, 6 * SIZE +#else + vld VX3, Y, 4 * SIZE + vfmul.s VX2, VX2, VXB + vfmul.s VX3, VX3, VXB + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 blt $r0, I, .L213 b .L997 .align 3 .L214: // ALPHA==0 BETA==0 vst VXZ, Y, 0 * SIZE +#ifdef DOUBLE vst VXZ, Y, 2 * SIZE vst VXZ, Y, 4 * SIZE vst VXZ, Y, 6 * SIZE +#else + vst VXZ, Y, 4 * SIZE +#endif addi.d Y, Y, 8 * SIZE addi.d I, I, -1 blt $r0, I, .L214 @@ -455,20 +731,21 @@ .L22: bge $r0, I, .L997 move YY, Y - fcmp.ceq.d $fcc0, ALPHA, a1 + CMPEQ $fcc0, ALPHA, a1 bcnez $fcc0, .L220 - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 b .L221 // ALPHA!=0 BETA!=0 .align 3 .L220: - fcmp.ceq.d $fcc0, BETA, a1 + CMPEQ $fcc0, BETA, a1 bcnez $fcc0, .L224 // ALPHA==0 BETA==0 b .L223 // ALPHA==0 BETA!=0 .align 3 .L221: // ALPHA!=0 BETA!=0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -541,12 +818,83 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vfmadd.s VX2, VX2, VXB, VX0 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vfmadd.s VX3, VX3, VXB, VX1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L221 b .L997 .align 3 .L222: // ALPHA!=0 BETA==0 +#ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -591,12 +939,57 @@ vstelm.d VX1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX1, YY, 0, 1 +#else + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vfmul.s VX0, VX0, VXA + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + add.d X, X, INCX + ld.w t4, X, 0 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vstelm.w VX0, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX0, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX1, VX1, VXA + addi.d I, I, -1 + vstelm.w VX1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX1, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L222 b .L997 .align 3 .L223: // ALPHA==0 BETA!=0 +#ifdef DOUBLE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -641,12 +1034,57 @@ vstelm.d VX3, YY, 0, 0 add.d YY, YY, INCY vstelm.d VX3, YY, 0, 1 +#else + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vfmul.s VX2, VX2, VXB + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vstelm.w VX2, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX2, YY, 0, 3 + add.d YY, YY, INCY + vfmul.s VX3, VX3, VXB + addi.d I, I, -1 + vstelm.w VX3, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VX3, YY, 0, 3 +#endif add.d YY, YY, INCY blt $r0, I, .L223 b .L997 .align 3 .L224: // ALPHA==0 BETA==0 +#ifdef DOUBLE vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 @@ -662,6 +1100,23 @@ vstelm.d VXZ, YY, 0, 0 add.d YY, YY, INCY vstelm.d VXZ, YY, 0, 1 +#else + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 0 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 1 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 2 + add.d YY, YY, INCY + vstelm.w VXZ, YY, 0, 3 +#endif add.d YY, YY, INCY addi.d I, I, -1 blt $r0, I, .L224 @@ -674,12 +1129,12 @@ .align 3 .L998: - fld.d $f12, X, 0 * SIZE - fld.d $f13, Y, 0 * SIZE + LD $f12, X, 0 * SIZE + LD $f13, Y, 0 * SIZE addi.d I, I, -1 - fmul.d $f12, $f12, ALPHA - fmadd.d $f13, $f13, BETA, $f12 - fst.d $f13, Y, 0 * SIZE + MUL $f12, $f12, ALPHA + MADD $f13, $f13, BETA, $f12 + ST $f13, Y, 0 * SIZE add.d X, X, INCX add.d Y, Y, INCY blt $r0, I, .L998 diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S deleted file mode 100644 index c5d1ff402..000000000 --- a/kernel/loongarch64/saxpby_lasx.S +++ /dev/null @@ -1,597 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $xr8 -#define VX1 $xr20 -#define VX2 $xr21 -#define VX3 $xr22 -#define VXA $xr23 -#define VXB $xr9 -#define VXZ $xr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - xvreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - xvreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - xvreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvfmadd.s VX2, VX2, VXB, VX0 - xvst VX2, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 - b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - xvld VX0, X, 0 * SIZE - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VXA, VX0 - addi.d I, I, -1 - xvst VX0, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - xvld VX2, Y, 0 * SIZE - xvfmul.s VX2, VX2, VXB - xvst VX2, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - xvst VXZ, Y, 0 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX0, VX0, VXA - xvfmadd.s VX2, VX2, VXB, VX0 - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VX0, VX0, VXA - addi.d I, I, -1 - xvstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX0, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VX2, VX2, VXB - addi.d I, I, -1 - xvstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VX2, YY, 0, 7 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S deleted file mode 100644 index 7f8cea2dd..000000000 --- a/kernel/loongarch64/saxpby_lsx.S +++ /dev/null @@ -1,629 +0,0 @@ -#define ASSEMBLER - -#include "common.h" -#define N $r4 -#define ALPHA $f0 -#define X $r5 -#define INCX $r6 -#define BETA $f1 -#define Y $r7 -#define INCY $r8 - -#define I $r12 -#define TEMP $r13 -#define t1 $r14 -#define t2 $r16 -#define t3 $r15 -#define t4 $r17 -#define XX $r18 -#define YY $r19 -#define a1 $f12 -#define a2 $f13 -#define VX0 $vr8 -#define VX1 $vr20 -#define VX2 $vr21 -#define VX3 $vr22 -#define VXA $vr23 -#define VXB $vr9 -#define VXZ $vr19 - - PROLOGUE - - bge $r0, N, .L999 - li.d TEMP, 1 - movgr2fr.d a1, $r0 - ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - movfr2gr.s t1, ALPHA - vreplgr2vr.w VXA, t1 - movfr2gr.s t2, BETA - vreplgr2vr.w VXB, t2 - movfr2gr.s t3, a1 - vreplgr2vr.w VXZ, t3 - srai.d I, N, 3 - bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 -.L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L110 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L112 // ALPHA!=0 BETA==0 - b .L111 // ALPHA!=0 BETA!=0 - .align 3 - -.L110: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L114 // ALPHA==0 BETA==0 - b .L113 // ALPHA==0 BETA!=0 - .align 3 - -.L111: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vfmadd.s VX2, VX2, VXB, VX0 - vfmadd.s VX3, VX3, VXB, VX1 - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L112: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vst VX0, Y, 0 * SIZE - vst VX1, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L120 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L122 // ALPHA!=0 BETA==0 - b .L121 // ALPHA!=0 BETA!=0 - .align 3 - -.L120: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L124 // ALPHA==0 BETA==0 - b .L123 // ALPHA==0 BETA!=0 - .align 3 - -.L121: // ALPHA!=0 BETA!=0 - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - vld VX1, X, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // ALPHA!=0 BETA==0 - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - vfmul.s VX0, VX0, VXA - vfmul.s VX1, VX1, VXA - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L124 - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L210 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L212 // ALPHA!=0 BETA==0 - b .L211 // ALPHA!=0 BETA!=0 - .align 3 - -.L210: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L214 // ALPHA==0 BETA==0 - b .L213 // ALPHA==0 BETA!=0 - .align 3 - -.L211: // ALPHA!=0 BETA!=0 - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - vld VX3, Y, 4 * SIZE - vfmadd.s VX2, VX2, VXB, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX2, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - vfmadd.s VX3, VX3, VXB, VX1 - addi.d I, I, -1 - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VXA, VX0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vst VX0, Y, 0 * SIZE - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vst VX1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // ALPHA==0 BETA!=0 - vld VX2, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - vfmul.s VX2, VX2, VXB - vfmul.s VX3, VX3, VXB - vst VX2, Y, 0 * SIZE - vst VX3, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // ALPHA==0 BETA==0 - vst VXZ, Y, 0 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L214 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - fcmp.ceq.s $fcc0, ALPHA, a1 - bcnez $fcc0, .L220 - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L222 // ALPHA!=0 BETA==0 - b .L221 // ALPHA!=0 BETA!=0 - .align 3 - -.L220: - fcmp.ceq.s $fcc0, BETA, a1 - bcnez $fcc0, .L224 // ALPHA==0 BETA==0 - b .L223 // ALPHA==0 BETA!=0 - .align 3 - -.L221: // ALPHA!=0 BETA!=0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vfmadd.s VX2, VX2, VXB, VX0 - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vfmadd.s VX3, VX3, VXB, VX1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L221 - b .L997 - .align 3 - -.L222: // ALPHA!=0 BETA==0 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - add.d X, X, INCX - vfmul.s VX0, VX0, VXA - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - vstelm.w VX0, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX0, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX1, VX1, VXA - addi.d I, I, -1 - vstelm.w VX1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // ALPHA==0 BETA!=0 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - vfmul.s VX2, VX2, VXB - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - vstelm.w VX2, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX2, YY, 0, 3 - add.d YY, YY, INCY - vfmul.s VX3, VX3, VXB - addi.d I, I, -1 - vstelm.w VX3, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VX3, YY, 0, 3 - add.d YY, YY, INCY - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // ALPHA==0 BETA==0 - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - b .L997 - .align 3 - -.L997: - andi I, N, 7 - bge $r0, I, .L999 - .align 3 - -.L998: - fld.s $f12, X, 0 * SIZE - fld.s $f13, Y, 0 * SIZE - addi.d I, I, -1 - fmul.s $f12, $f12, ALPHA - fmadd.s $f13, $f13, BETA, $f12 - fst.s $f13, Y, 0 * SIZE - add.d X, X, INCX - add.d Y, Y, INCY - blt $r0, I, .L998 - .align 3 - -.L999: - move $r4, $r12 - jirl $r0, $r1, 0x0 - .align 3 - - EPILOGUE From 546f13558caad680fdd82778c106ebaa3bd7331c Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 29 Dec 2023 11:03:53 +0800 Subject: [PATCH 531/718] loongarch64: Add {c/z}swap and {c/z}sum optimization --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 6 + kernel/loongarch64/csum_lasx.S | 274 +++++++++++++++ kernel/loongarch64/csum_lsx.S | 266 ++++++++++++++ kernel/loongarch64/cswap_lasx.S | 394 +++++++++++++++++++++ kernel/loongarch64/cswap_lsx.S | 421 +++++++++++++++++++++++ 6 files changed, 1367 insertions(+) create mode 100644 kernel/loongarch64/csum_lasx.S create mode 100644 kernel/loongarch64/csum_lsx.S create mode 100644 kernel/loongarch64/cswap_lasx.S create mode 100644 kernel/loongarch64/cswap_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 0fb0bb68f..01f8e4782 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lsx.S CNRM2KERNEL = cnrm2_lsx.S ZNRM2KERNEL = znrm2_lsx.S +CSWAPKERNEL = cswap_lsx.S +ZSWAPKERNEL = cswap_lsx.S + +CSUMKERNEL = csum_lsx.S +ZSUMKERNEL = csum_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 1a6a04532..a9e8abaf0 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -75,6 +75,12 @@ DNRM2KERNEL = dnrm2_lasx.S CNRM2KERNEL = cnrm2_lasx.S ZNRM2KERNEL = znrm2_lasx.S +CSWAPKERNEL = cswap_lasx.S +ZSWAPKERNEL = cswap_lasx.S + +CSUMKERNEL = csum_lasx.S +ZSUMKERNEL = csum_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S new file mode 100644 index 000000000..3e65f2c15 --- /dev/null +++ b/kernel/loongarch64/csum_lasx.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 +#define res1 $xr16 +#define res2 $xr17 + PROLOGUE + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + xvld VX2, X, 8 * SIZE + xvld VX3, X, 12 * SIZE + xvfadd.d res2, VX2, VX3 + xvfadd.d res1, res1, res2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d X, X, 16 * SIZE + addi.d I, I, -1 + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + xvfadd.d res2, VX0, VX1 + xvfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w VX1, t1, 4 + xvinsgr2vr.w VX1, t2, 5 + xvinsgr2vr.w VX1, t3, 6 + xvinsgr2vr.w VX1, t4, 7 + xvfadd.s res2, VX0, VX1 + xvfadd.s res1, res2, res1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + xvpickve.d VX1, res1, 1 + xvpickve.d VX2, res1, 2 + xvpickve.d VX3, res1, 3 + xvfadd.d res1, VX1, res1 + xvfadd.d res1, VX2, res1 + xvfadd.d res1, VX3, res1 +#else + xvfadd.s res2, res1, res2 + xvpickve.w VX1, res1, 1 + xvpickve.w VX2, res1, 2 + xvpickve.w VX3, res1, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvpickve.w VX0, res2, 4 + xvpickve.w VX1, res2, 5 + xvpickve.w VX2, res2, 6 + xvpickve.w VX3, res2, 7 + xvfadd.s res1, VX0, res1 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX2, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S new file mode 100644 index 000000000..8de8e27ca --- /dev/null +++ b/kernel/loongarch64/csum_lsx.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define t1 $r15 +#define t2 $r12 +#define t3 $r13 +#define t4 $r14 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 +#define res1 $vr16 +#define res2 $vr17 +#define res3 $vr18 + PROLOGUE + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + bge $r0, N, .L999 + bge $r0, INCX, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L13 + .align 3 + +.L11: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 4 * SIZE + vld VX3, X, 6 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 + vld VX0, X, 8 * SIZE + vld VX1, X, 10 * SIZE + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + vld VX2, X, 12 * SIZE + vld VX3, X, 14 * SIZE + vfadd.d res2, VX2, VX3 + vfadd.d res1, res1, res2 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfadd.s res2, VX0, VX1 + vld VX2, X, 8 * SIZE + vld VX3, X, 12 * SIZE + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + + addi.d I, I, -1 + addi.d X, X, 16 * SIZE + blt $r0, I, .L11 + .align 3 + +.L12: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L13: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L14: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L14 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L23 + .align 3 + +.L21: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + vinsgr2vr.d VX1, t1, 0 + vinsgr2vr.d VX1, t2, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d VX0, t3, 0 + vinsgr2vr.d VX0, t4, 1 + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vfadd.d res2, VX0, VX1 + vfadd.d res1, res1, res2 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + vfadd.s res2, VX0, VX1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + vfadd.s res3, VX2, VX3 + vfadd.s res2, res3, res2 + vfadd.s res1, res1, res2 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L22: +#ifdef DOUBLE + vreplvei.d VX1, res1, 1 + vfadd.d res1, VX1, res1 +#else + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 +#endif + .align 3 + +.L23: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L24: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + ADD a1, a1, a2 + ADD s1, a1, s1 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L24 + .align 3 + +.L999: + fmov.s $f0, $f16 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S new file mode 100644 index 000000000..d53773d5a --- /dev/null +++ b/kernel/loongarch64/cswap_lasx.S @@ -0,0 +1,394 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $xr12 +#define VX1 $xr13 +#define VX2 $xr14 +#define VX3 $xr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX3, X, 4 * SIZE + xvst VX0, Y, 0 * SIZE + xvst VX1, Y, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + xvld VX2, Y, 0 * SIZE + xvst VX2, X, 0 * SIZE + xvst VX0, Y, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX0, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + xvstelm.d VX1, Y, 0 * SIZE, 2 + ld.d t4, Y, 1 * SIZE + xvstelm.d VX1, Y, 1 * SIZE, 3 + xvinsgr2vr.d VX3, t1, 0 + xvinsgr2vr.d VX3, t2, 1 + xvinsgr2vr.d VX3, t3, 2 + xvinsgr2vr.d VX3, t4, 3 + add.d Y, Y, INCY + xvst VX3, X, 4 * SIZE +#else + xvld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 3 + xvinsgr2vr.w VX2, t1, 0 + xvinsgr2vr.w VX2, t2, 1 + xvinsgr2vr.w VX2, t3, 2 + xvinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + ld.w t1, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 4 + ld.w t2, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 5 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + xvstelm.w VX0, Y, 0 * SIZE, 6 + ld.w t4, Y, 1 * SIZE + xvstelm.w VX0, Y, 1 * SIZE, 7 + xvinsgr2vr.w VX2, t1, 4 + xvinsgr2vr.w VX2, t2, 5 + xvinsgr2vr.w VX2, t3, 6 + xvinsgr2vr.w VX2, t4, 7 + add.d Y, Y, INCY + xvst VX2, X, 0 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21: + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX2, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX2, X, 1 * SIZE, 3 + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + xvstelm.d VX3, X, 0 * SIZE, 2 + ld.d t4, X, 1 * SIZE + xvstelm.d VX3, X, 1 * SIZE, 3 + xvinsgr2vr.d VX1, t1, 0 + xvinsgr2vr.d VX1, t2, 1 + xvinsgr2vr.d VX1, t3, 2 + xvinsgr2vr.d VX1, t4, 3 + add.d X, X, INCX + xvst VX1, Y, 4 * SIZE +#else + xvld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 3 + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + ld.w t1, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 4 + ld.w t2, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 5 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + xvstelm.w VX2, X, 0 * SIZE, 6 + ld.w t4, X, 1 * SIZE + xvstelm.w VX2, X, 1 * SIZE, 7 + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + add.d X, X, INCX + xvst VX0, Y, 0 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S new file mode 100644 index 000000000..62a869066 --- /dev/null +++ b/kernel/loongarch64/cswap_lsx.S @@ -0,0 +1,421 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define t1 $r14 +#define t2 $r15 +#define t3 $r16 +#define t4 $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define b1 $f16 +#define b2 $f17 +#define b3 $f18 +#define b4 $f19 +#define VX0 $vr12 +#define VX1 $vr13 +#define VX2 $vr14 +#define VX3 $vr15 + + + PROLOGUE + bge $r0, N, .L999 + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L112 + .align 3 + +.L111: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 2 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 2 * SIZE + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vst VX2, X, 4 * SIZE + vst VX3, X, 6 * SIZE + vst VX0, Y, 4 * SIZE + vst VX1, Y, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vst VX2, X, 0 * SIZE + vst VX3, X, 4 * SIZE + vst VX0, Y, 0 * SIZE + vst VX1, Y, 4 * SIZE +#endif + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L111 + .align 3 + +.L112: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L113: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L113 + b .L999 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L122 + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 2 * SIZE + vld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + vstelm.d VX0, Y, 0 * SIZE, 0 + ld.d t2, Y, 1 * SIZE + vstelm.d VX0, Y, 1 * SIZE, 1 + vinsgr2vr.d VX2, t1, 0 + vinsgr2vr.d VX2, t2, 1 + add.d Y, Y, INCY + vst VX2, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + vstelm.d VX1, Y, 0 * SIZE, 0 + ld.d t4, Y, 1 * SIZE + vstelm.d VX1, Y, 1 * SIZE, 1 + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + vst VX3, X, 6 * SIZE +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX0, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX0, Y, 1 * SIZE, 3 + vinsgr2vr.w VX2, t1, 0 + vinsgr2vr.w VX2, t2, 1 + vinsgr2vr.w VX2, t3, 2 + vinsgr2vr.w VX2, t4, 3 + add.d Y, Y, INCY + vst VX2, X, 0 * SIZE + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 0 + ld.w t2, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 1 + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + vstelm.w VX1, Y, 0 * SIZE, 2 + ld.w t4, Y, 1 * SIZE + vstelm.w VX1, Y, 1 * SIZE, 3 + vinsgr2vr.w VX3, t1, 0 + vinsgr2vr.w VX3, t2, 1 + vinsgr2vr.w VX3, t3, 2 + vinsgr2vr.w VX3, t4, 3 + add.d Y, Y, INCY + vst VX3, X, 4 * SIZE +#endif + addi.d X, X, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L121 + .align 3 + +.L122: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L123: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L123 + b .L999 + .align 3 + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L212 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 2 * SIZE + vld VX2, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + vstelm.d VX2, X, 0 * SIZE, 0 + ld.d t2, X, 1 * SIZE + vstelm.d VX2, X, 1 * SIZE, 1 + vinsgr2vr.d VX0, t1, 0 + vinsgr2vr.d VX0, t2, 1 + add.d X, X, INCX + vst VX0, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t3, X, 0 * SIZE + vstelm.d VX3, X, 0 * SIZE, 0 + ld.d t4, X, 1 * SIZE + vstelm.d VX3, X, 1 * SIZE, 1 + vinsgr2vr.d VX1, t3, 0 + vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX + vst VX1, Y, 6 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX2, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX2, X, 1 * SIZE, 3 + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 + add.d X, X, INCX + vst VX0, Y, 0 * SIZE + + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 0 + ld.w t2, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 1 + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + vstelm.w VX3, X, 0 * SIZE, 2 + ld.w t4, X, 1 * SIZE + vstelm.w VX3, X, 1 * SIZE, 3 + vinsgr2vr.w VX1, t1, 0 + vinsgr2vr.w VX1, t2, 1 + vinsgr2vr.w VX1, t3, 2 + vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX + vst VX1, Y, 4 * SIZE +#endif + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L211 + .align 3 + +.L212: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L213: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L213 + b .L999 + .align 3 + +.L22: + bge $r0, I, .L223 + .align 3 + move XX, X + +.L222: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + LD a1, X, 0 * SIZE + ST b1, XX, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, X, 0 * SIZE + ST b3, XX, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + + LD b1, Y, 0 * SIZE + ST a1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD b3, Y, 0 * SIZE + ST a3, Y, 0 * SIZE + LD b4, Y, 1 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + + ST b1, XX, 0 * SIZE + ST b2, XX, 1 * SIZE + add.d XX, XX, INCX + ST b3, XX, 0 * SIZE + ST b4, XX, 1 * SIZE + add.d XX, XX, INCX + addi.d I, I, -1 + blt $r0, I, .L222 + .align 3 + +.L223: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L224: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, X, 0 * SIZE + ST a4, X, 1 * SIZE + + addi.d I, I, -1 + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L224 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From a5d0d21378f6da778b18c8f9c5fd4ad33d9b52d5 Mon Sep 17 00:00:00 2001 From: pengxu Date: Fri, 29 Dec 2023 15:10:01 +0800 Subject: [PATCH 532/718] loongarch64: Add zgemm and cgemm optimization --- common_loongarch64.h | 4 + kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 24 +- kernel/loongarch64/cgemm_kernel_2x2_lasx.S | 857 +++++++++++++++++++++ kernel/loongarch64/cgemm_kernel_2x2_lsx.S | 812 +++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_2_lasx.S | 193 +++++ kernel/loongarch64/cgemm_ncopy_2_lsx.S | 202 +++++ kernel/loongarch64/cgemm_tcopy_2_lasx.S | 218 ++++++ kernel/loongarch64/cgemm_tcopy_2_lsx.S | 218 ++++++ kernel/loongarch64/zgemm_kernel_2x2.S | 848 ++++++++++++++++++++ kernel/loongarch64/zgemm_kernel_2x2_lasx.S | 822 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_2_lasx.S | 196 +++++ kernel/loongarch64/zgemm_tcopy_2_lasx.S | 212 +++++ param.h | 8 +- 14 files changed, 4621 insertions(+), 5 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2.S create mode 100644 kernel/loongarch64/zgemm_kernel_2x2_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_2_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_2_lasx.S diff --git a/common_loongarch64.h b/common_loongarch64.h index e581e2e3e..b1426da79 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -144,6 +144,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.d #define XVMUL xvfmul.d #define XVMSUB xvfmsub.d +#define XVNMSUB xvfnmsub.d #define VFSUB vfsub.d #define VFADD vfadd.d @@ -158,6 +159,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.d #define VMUL vfmul.d #define VMSUB vfmsub.d +#define VNMSUB vfnmsub.d #else @@ -198,6 +200,7 @@ static inline int WhereAmI(void){ #define XVCMPLT xvfcmp.clt.s #define XVMUL xvfmul.s #define XVMSUB xvfmsub.s +#define XVNMSUB xvfnmsub.s #define VFSUB vfsub.s #define VFADD vfadd.s @@ -212,6 +215,7 @@ static inline int WhereAmI(void){ #define VCMPLT vfcmp.clt.s #define VMUL vfmul.s #define VMSUB vfmsub.s +#define VNMSUB vfnmsub.s #endif /* defined(DOUBLE) */ diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 01f8e4782..c365e9a75 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -95,4 +95,16 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index a9e8abaf0..68360faaf 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -107,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S +CGEMMKERNEL = cgemm_kernel_2x2_lsx.S +CGEMMONCOPY = cgemm_ncopy_2_lsx.S +CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S +ZGEMMONCOPY = zgemm_ncopy_2_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S new file mode 100644 index 000000000..e07f7dc64 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S @@ -0,0 +1,857 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFSUB +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFADD +#define XVFADD3 XVFADD +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFADD +#define XVFADD4 XVFADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define XVFADD1 XVFADD +#define XVFADD2 XVFSUB +#define XVFADD3 XVFSUB +#define XVFADD4 XVFSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvpermi.d U0, U0, 0xd8 + xvpermi.d U2, U2, 0xd8 + + xvst U0, TD, 0x00 + xvst U2, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S new file mode 100644 index 000000000..1cf4d87dc --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S @@ -0,0 +1,202 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + slli.d T0, TL, 0x01 + + srai.d I, N, 0x01 + beq I, ZERO, .L_N0 + +.L_J1: /* if (i > 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset1 + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TS, TS, 0x20 // a_offset + addi.d TD, TD, 0x20 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d TS, TS, 0x08 // a_offset + addi.d TD, TD, 0x08 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..e2245e412 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x00 + xvld U2, S2, 0x00 + + xvpermi.q U0, U2, 0x02 + xvpermi.q U2, U1, 0x31 + + xvst U0, S8, 0x00 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + xvst U2, S8, 0x00 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S8, 0x00 + vst $vr1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld $vr0, TS, 0x00 + vld $vr1, TS, 0x10 + + vst $vr0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst $vr1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, TS, 0x00 + + vst $vr0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S new file mode 100644 index 000000000..15c0fde8f --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x20 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U2, S8, 0x10 + + slli.d T0, M, 0x04 + add.d S8, S8, T0 + + vst U1, S8, 0x00 + vst U3, S8, 0x10 + + add.d S8, S8, T0 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + fst.s F2, S9, 0x08 + fst.s F3, S9, 0x0c + + addi.d S9, S9, 0x10 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, TS, 0x00 + vld U1, TS, 0x10 + + vst U0, TD, 0x00 + + slli.d T0, M, 0x04 + add.d TD, TD, T0 + + vst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, TS, 0x00 + + vst U0, TD, 0x00 + + addi.d TS, TS, 0x10 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, TS, 0x00 + fld.s F1, TS, 0x04 + + fst.s F0, S9, 0x00 + fst.s F1, S9, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S new file mode 100644 index 000000000..589d170c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_2x2.S @@ -0,0 +1,848 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 +#define c51 $f26 +#define c52 $f27 +#define c61 $f28 +#define c62 $f29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -88 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + ST $f23, $sp, 32 + ST $f24, $sp, 40 + ST $f25, $sp, 48 + ST $f26, $sp, 56 + ST $f27, $sp, 64 + ST $f28, $sp, 72 + ST $f29, $sp, 80 + + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 1 + beq J, T0, .L19 + +.L10: /* for(j=0; j 0) I-- */ + move S1, TS //a_offset1 + add.d S2, TS, TL //a_offset2 + srai.d J, M, 0x02 + add.d TS, TS, T0 + + beq J, ZERO, .L_I3 + +.L_I1: /* if (j > 0) J-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset1 + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_I1 + +.L_I3: + andi J, M, 0x03 + beq J, ZERO, .L_II20 + +.L_II1: /* j = (m & 3) if (j > 0) */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d J, J, -1 + blt ZERO, J, .L_II1 + +.L_II20: + addi.d I, I, -1 + blt ZERO, I, .L_J1 + +.L_N0: /* if(n&1)*/ + andi I, N, 0x01 + beq ZERO, I, .L_N00 + +.L_N1: + srai.d J, M, 0x02 + beq ZERO, J, .L_N10 + +.L_N11: /* j = (m >> 2) if (j > 0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TS, TS, 0x40 // a_offset + addi.d TD, TD, 0x40 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N11 + +.L_N10: + andi J, M, 0x03 + beq J, ZERO, .L_N00 + +.L_N12: /* j = (m & 3) if (j > 0) */ + vld $vr0, TS, 0x00 + vst $vr0, TD, 0x00 + + + addi.d TS, TS, 0x10 // a_offset + addi.d TD, TD, 0x10 // b_offset + + addi.d J, J, -1 + blt ZERO, J, .L_N12 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S new file mode 100644 index 000000000..3fe17beef --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S @@ -0,0 +1,212 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r7 +#define T0 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + srai.d J, M, 0x01 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 + add.d S2, S1, TL //aoffset2 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U2, S8, 0x20 + + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + xvst U1, S8, 0x00 + xvst U3, S8, 0x20 + + add.d S8, S8, T0 + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S9, S9, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, TS, 0x00 + xvld U1, TS, 0x20 + + xvst U0, TD, 0x00 + + slli.d T0, M, 0x05 + add.d TD, TD, T0 + + xvst U1, TD, 0x00 + + add.d TD, TD, T0 + addi.d TS, TS, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, TS, 0x00 + + xvst U0, TD, 0x00 + + addi.d TS, TS, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, TS, 0x00 + + vst $vr0, S9, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index e4e242d5d..54760a82e 100644 --- a/param.h +++ b/param.h @@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 From 0f648ebcd15d4a577eae9584e95b804acc25cc3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 20:31:32 +0100 Subject: [PATCH 533/718] use alternate download for the CLFS cross-compiler package --- .github/workflows/loongarch64.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 42393eb0c..f1bf8064c 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -40,7 +40,8 @@ jobs: - name: Download and install loongarch64-toolchain run: | - wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz + #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - name: Set env From 44b5b9e39fa9bc98eb880d3f781ee4c337459ec6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 22:50:40 +0100 Subject: [PATCH 534/718] Update C/ZGEMM MN for Loongson2k1000 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index 03cea76c9..03d6fa11d 100644 --- a/param.h +++ b/param.h @@ -2891,11 +2891,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 1 -#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_M 1 -#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 From 2802478449b7ea9bc7d1517c055635e8037908dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Dec 2023 23:35:51 +0100 Subject: [PATCH 535/718] revert change to Loongson2k1000 zgemm --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 03d6fa11d..f3731c491 100644 --- a/param.h +++ b/param.h @@ -2894,8 +2894,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 128 From d3451af03f97a5087cbfcdc1ca83cdf07616d7b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:35:37 +0100 Subject: [PATCH 536/718] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/cbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f index 1cd1ffbf1..4d0c45efe 100644 --- a/lapack-netlib/SRC/cbbcsd.f +++ b/lapack-netlib/SRC/cbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), From bd787c8a1a7039fe4c0e6df5c1d9ff70738342dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:36:47 +0100 Subject: [PATCH 537/718] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/dbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f index 4fcf9ab5d..913f96a73 100644 --- a/lapack-netlib/SRC/dbbcsd.f +++ b/lapack-netlib/SRC/dbbcsd.f @@ -805,7 +805,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), From 1b668479deb9e1ef229c8f40b96d6f56b60402c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:37:52 +0100 Subject: [PATCH 538/718] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/sbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f index d5720cb33..2a619cb71 100644 --- a/lapack-netlib/SRC/sbbcsd.f +++ b/lapack-netlib/SRC/sbbcsd.f @@ -805,7 +805,7 @@ CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1), $ WORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1), + CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1), $ WORK(IU2SN+I-1) ) ELSE CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1), From 00d7476b4beba9cbd0cceac89e483e14664b9037 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 12:39:21 +0100 Subject: [PATCH 539/718] Fix uninitialized read/wrong variable (Reference-LAPACK PR 967) --- lapack-netlib/SRC/zbbcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f index 757e98c71..6601f4a06 100644 --- a/lapack-netlib/SRC/zbbcsd.f +++ b/lapack-netlib/SRC/zbbcsd.f @@ -804,7 +804,7 @@ CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1), $ RWORK(IU2CS+I-1), R ) ELSE IF( NU .LT. MU ) THEN - CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1), + CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1), $ RWORK(IU2SN+I-1) ) ELSE CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1), From 4a15d7242037862053a6332844f3838cd1ec64e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 Dec 2023 16:30:57 +0100 Subject: [PATCH 540/718] AzureCI: Update alpine-chroot-install (#4403) * Update alpine-chroot-install --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 317bc504a..3ae8615a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -288,9 +288,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ - && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \ + && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 504f9b0c5e4d7a9f6d87639d18dfb193e5b0b980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 18:46:21 +0100 Subject: [PATCH 541/718] Increase S/D GEMM PQ to match typical L2 size as forNeoverseV1 --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index f3731c491..469c38ce3 100644 --- a/param.h +++ b/param.h @@ -3359,13 +3359,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 128 -#define DGEMM_DEFAULT_P 160 +#define SGEMM_DEFAULT_P 240 +#define DGEMM_DEFAULT_P 240 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 352 -#define DGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 640 +#define DGEMM_DEFAULT_Q 320 #define CGEMM_DEFAULT_Q 224 #define ZGEMM_DEFAULT_Q 112 From 03713bc464bf20a57ad777a41820c608fb5d0e8c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:08:49 +0100 Subject: [PATCH 542/718] Update Changelog for 0.3.26 --- Changelog.txt | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index e0fe0ca5a..b6139d6b7 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,49 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.26 + 2-Jan-2024 + +general: +- improved the version of openblas.pc that is created by the CMAKE build +- fixed a CMAKE-specific build problem on older versions of MacOS +- worked around linking problems on old versions of MacOS +- corrected installation location of the lapacke_mangling header in CMAKE builds +- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header +- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading +- imported additions and corrections from the Reference-LAPACK project: + - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941) + - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942) + - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959) + - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967) + - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736) + - fixed several memory leaks in the LAPACK testsuite (PR 953) + - fixed counting of testsuite results by the Python script (PR 954) + +x86-64: +- fixed computation of CASUM on SkylakeX and newer targets in the special + case that AVX512 is not supported by the compiler or operating environment +- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets +- worked around a problem in the pre-AVX kernels for GEMV +- sped up the thread management code on MS Windows + +arm64: +- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer +- sped up the thread management code on MS Windows +- sped up SGEMM and DGEMM on Neoverse V1 and N1 +- sped up ?DOT on SVE-capable targets +- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones +- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds + +power: +- improved the SGEMM kernel for POWER10 +- fixed compilation with (very) old versions of gcc +- fixed detection of old 32bit PPC targets in CMAKE-based builds +- added autodetection of the POWERPC 7400 subtype +- fixed CMAKE-based compilation for PPCG4 and PPC970 targets + +loongarch64: +- added and improved optimized kernels for almost all BLAS functions + ==================================================================== Version 0.3.25 12-Nov-2023 From fde8bb990372980b840758bcef714bf565cdfb68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:24:33 +0100 Subject: [PATCH 543/718] Update version to 0.3.26 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3eac2edf..6d09f5658 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 25.dev) +set(OpenBLAS_PATCH_VERSION 26) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 6c77e5e314474773a7749357b153caba4ec3817d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:25:05 +0100 Subject: [PATCH 544/718] Update Makefile.rule --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 58f02358e..daf2d958d 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.25.dev +VERSION = 0.3.26 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 4f5da84e2f447b466db2a3d468dbee8881cc23a0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:32:27 +0100 Subject: [PATCH 545/718] Update version to 0.3.26.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d09f5658..5a1e4b271 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 26) +set(OpenBLAS_PATCH_VERSION 26.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 1412d2deeb32cfc1d80150eba520a5bba915f1c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jan 2024 22:33:01 +0100 Subject: [PATCH 546/718] Update version to 0.3.26.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index daf2d958d..8dbf5eab6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.26 +VERSION = 0.3.26.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From e48627c999e3aadfbcdb2b1f1589d4e75840dfca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jan 2024 23:55:52 +0100 Subject: [PATCH 547/718] Add tests for ZSCAL with NaN and Inf arguments --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_zscal.c | 49 +++++++++++++++++++++++++++++++++++++++++++ utest/utest_main2.c | 50 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 utest/test_zscal.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 2e32827d3..c47954ce4 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -15,6 +15,7 @@ else () test_dsdot.c test_dnrm2.c test_swap.c + test_zscal.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index f99035440..d0715c754 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_zscal.c b/utest/test_zscal.c new file mode 100644 index 000000000..b6310439f --- /dev/null +++ b/utest/test_zscal.c @@ -0,0 +1,49 @@ +#include "openblas_utest.h" +#include +#ifdef BUILD_COMPLEX16 + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + +#endif diff --git a/utest/utest_main2.c b/utest/utest_main2.c index 4382bf159..8cb663190 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -617,6 +617,51 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(zscal, i_nan) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 1); + ASSERT(isnan(nan[0]); + ASSERT(isnan(nan[1]); + ASSERT(isnan(nan[16]); + ASSERT(isnan(nan[17]); +} + +CTEST(zscal, nan_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 1); + ASSERT(isnan(i[0]); + ASSERT(isnan(i[1]); + ASSERT(isnan(i[16]); + ASSERT(isnan(i[17]); + } + +CTEST(zscal, i_inf) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 1); + ASSERT(isnan(inf[0]); + ASSERT(isinf(inf[1]); + ASSERT(isnan(inf[16]); + ASSERT(isinf(inf[17]); +} + +CTEST(zscal, inf_i) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 1); + ASSERT(isnan(i[0]); + ASSERT(isinf(i[1]); + ASSERT(isnan(i[16]); + ASSERT(isinf(i[17]); +} + int main(int argc, const char ** argv){ CTEST_ADD (amax, samax); @@ -648,7 +693,10 @@ int main(int argc, const char ** argv){ CTEST_ADD (swap,zswap_inc_0); CTEST_ADD (swap,sswap_inc_0); CTEST_ADD (swap,cswap_inc_0); - + CTEST_ADD (zscal, i_nan); + CTEST_ADD (zscal, nan_i); + CTEST_ADD (zscal, i_inf); + CTEST_ADD (zscal, inf_i); int num_fail=0; num_fail=ctest_main(argc, argv); From def4996170815ced665af2d1d8bbc08490feb809 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 15:29:42 +0100 Subject: [PATCH 548/718] Fix handling of NAN and INF arguments --- kernel/x86_64/zscal.c | 61 +++++++++++++++++++------------------- kernel/x86_64/zscal_sse2.S | 1 + 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 45e3531b8..b8d2a8754 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #include "common.h" - +#include #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_skylakex-2.c" @@ -222,12 +222,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_r == 0.0 ) { - BLASLONG n1 = n & -2; if ( da_i == 0.0 ) { - while(j < n1) { @@ -253,7 +251,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n1) { @@ -356,49 +353,59 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[0] = da_r; alpha[1] = da_i; - + if ( da_r == 0.0 ) if ( da_i == 0 ) zscal_kernel_8_zero(n1 , alpha , x); else - zscal_kernel_8_zero_r(n1 , alpha , x); +// zscal_kernel_8_zero_r(n1 , alpha , x); + zscal_kernel_8(n1 , alpha , x); else - if ( da_i == 0 ) + if ( da_i == 0 && da_r == da_r) zscal_kernel_8_zero_i(n1 , alpha , x); else zscal_kernel_8(n1 , alpha , x); - + } i = n1 << 1; j = n1; - } - - - if ( da_r == 0.0 ) + + if ( da_r == 0.0 || da_r != da_r ) { - if ( da_i == 0.0 ) { - + FLOAT res=0.0; + if (da_r != da_r) res= da_r; while(j < n) { - - x[i]=0.0; - x[i+1]=0.0; + x[i]=res; + x[i+1]=res; i += 2 ; j++; } } - else + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { + while(j < n) + { + x[i]=SNAN; + x[i+1]=da_r; + i += 2 ; + j++; + + } + + } else { while(j < n) { - temp0 = -da_i * x[i+1]; + if (x[i] < -FLT_MAX || x[i] > FLT_MAX) + temp0 = SNAN; x[i+1] = da_i * x[i]; - x[i] = temp0; + if ( x[i] == x[i]) //preserve NaN + x[i] = temp0; i += 2 ; j++; @@ -409,12 +416,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - - if ( da_i == 0.0 ) + if (da_i == 0.0) { - - while(j < n) - { + while(j < n) + { temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; @@ -422,15 +427,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, i += 2 ; j++; - } - + } } else { while(j < n) { - temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; x[i] = temp0; @@ -445,5 +448,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return(0); } - - diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 223b1e439..d6a49136d 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -82,6 +82,7 @@ pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 + jp .L100 comisd %xmm1, %xmm15 jne .L100 From c9df62e88375fae17adf555d73a757e2e8f65cab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 17:49:40 +0100 Subject: [PATCH 549/718] Fix handling of NAN --- kernel/arm64/zscal.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 929455975..4bd43320d 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -223,7 +223,7 @@ zscal_begin: fcmp DA_I, #0.0 beq .Lzscal_kernel_RI_zero - b .Lzscal_kernel_R_zero +// b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: From d3d99c34f24a426bd90dfd97f76683dcd0eb2006 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 17:56:51 +0100 Subject: [PATCH 550/718] Fix handling of NAN and INF --- kernel/mips/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..d8cbffcf9 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,6 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } + if (!isnan(x_ip] && !isinf(x[ip])) x[ip] = temp; ip += inc_x2; From 711433fcf0257cd07cf728c64e175960d0da38a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:01:58 +0100 Subject: [PATCH 551/718] Update zscal.c --- kernel/mips/zscal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index d8cbffcf9..127f4f444 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,9 +63,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - if (!isnan(x_ip] && !isinf(x[ip])) +// if (!isnan(x_ip] && !isinf(x[ip])) x[ip] = temp; - + if ( da_r != da_r ) x[ip] = da_r; ip += inc_x2; } From 903589f84b97b3975fa29a0f2a2fcef418040721 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:37:00 +0100 Subject: [PATCH 552/718] Update zscal.c --- kernel/mips/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index 127f4f444..e76fb1ff1 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } -// if (!isnan(x_ip] && !isinf(x[ip])) +// if (!isnan(x[ip]) && !isinf(x[ip])) x[ip] = temp; if ( da_r != da_r ) x[ip] = da_r; ip += inc_x2; From 0c33b57f5fb3721b2c6aa6fb5c7ddd09e17d6542 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 18:40:19 +0100 Subject: [PATCH 553/718] Handle NAN in input --- kernel/riscv64/zscal_vector.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index d275b75f8..24b44952d 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,7 +59,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ + if (isnan(da_r)) { + gvl = VSETVL(n); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = inc_x * 2 * gvl; + vt = VFMVVF_FLOAT(da_r, gvl); + for(i=0,j=0; i < n/(gvl*2); i++){ + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); + + j += gvl*2; + ix += inc_xv*2; + } + for(; j < n; ){ + gvl = VSETVL(n-j); + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + j += gvl; + ix += inc_x * 2 * gvl; + } + else if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From b08a208365f0bfbc8c85b85b4401d3a05f57a222 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 19:14:41 +0100 Subject: [PATCH 554/718] Update zscal_vector.c --- kernel/riscv64/zscal_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 24b44952d..33cb5d7ea 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } - else if(da_r == 0.0 && da_i == 0.0){ + } else if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From 2173356d5b63ea5e5f861bfebea387bc41d9e3b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 19:45:28 +0100 Subject: [PATCH 555/718] Update zscal_vector.c --- kernel/riscv64/zscal_vector.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 33cb5d7ea..a943135df 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,28 +59,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if (isnan(da_r)) { - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(da_r, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - } else if(da_r == 0.0 && da_i == 0.0){ + if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; @@ -101,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } +#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -129,6 +110,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); } + #endif }else if(da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); From 91bbde7f64ef9d176e69416897ce5e255c351ec8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:21:05 +0100 Subject: [PATCH 556/718] revert accidental direct commit to develop --- kernel/mips/zscal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index e76fb1ff1..bca1155c1 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,9 +63,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } -// if (!isnan(x[ip]) && !isinf(x[ip])) x[ip] = temp; - if ( da_r != da_r ) x[ip] = da_r; + ip += inc_x2; } From f052bd4705210ef6c7293447b236f3a634799ba1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:21:48 +0100 Subject: [PATCH 557/718] revert accidental direct commit to develop --- kernel/riscv64/zscal_vector.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index a943135df..d275b75f8 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,7 +59,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -81,7 +80,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } -#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -110,7 +108,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); } - #endif }else if(da_i == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); From acf17a825dadf070d595b901ae10d7f21db5aa25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:26:16 +0100 Subject: [PATCH 558/718] Handle NAN in input --- kernel/mips/zscal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index bca1155c1..d69477f97 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -63,8 +63,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; } } - x[ip] = temp; - + if ( da_r != da_r ) + x[ip] = da_r; + else + x[ip] = temp; + ip += inc_x2; } From f0808d856b69d750facf288460b65f0e9b0dde65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 20:27:29 +0100 Subject: [PATCH 559/718] Handle NAN in input --- kernel/riscv64/zscal_vector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index d275b75f8..77f4fc312 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -80,6 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_x * 2 * gvl; } +#if 0 }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,6 +98,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F j += gvl; ix += inc_xv; } +#endif if(j < n){ gvl = VSETVL(n-j); v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); From cf8b03ae8b59d41ea74fd7f8f52efe9fed897c23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jan 2024 23:09:57 +0100 Subject: [PATCH 560/718] Use NAN rather than SNAN for portability --- kernel/x86_64/zscal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index b8d2a8754..66c8a0d2b 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -388,8 +388,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { - x[i]=SNAN; - x[i+1]=da_r; + x[i]= NAN; + x[i+1] = da_r; i += 2 ; j++; @@ -402,7 +402,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { temp0 = -da_i * x[i+1]; if (x[i] < -FLT_MAX || x[i] > FLT_MAX) - temp0 = SNAN; + temp0 = NAN; x[i+1] = da_i * x[i]; if ( x[i] == x[i]) //preserve NaN x[i] = temp0; From 5e7f714e93848c72faa2d2e658b8584aa395f603 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 08:17:40 +0100 Subject: [PATCH 561/718] Update zscal.c --- kernel/mips/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index d69477f97..b117db1dd 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip]; x[ip+1] = da_i * x[ip] ; } } From 25b0c48082a34bb2e5e9beebfdb65810e2e5affc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 09:49:18 +0100 Subject: [PATCH 562/718] Update zscal.c --- kernel/mips/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index b117db1dd..7bb261941 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -47,7 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; - if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip]; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From f637e1271391765b4e9fc87b958ecb4fd7541460 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 09:52:38 +0100 Subject: [PATCH 563/718] Handle INF and NAN --- kernel/riscv64/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From 7ee1ee38e22904837afd3e51dd67f91cd499185f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 14:20:07 +0100 Subject: [PATCH 564/718] Handle NaN in input --- kernel/x86/zscal_sse.S | 1 + kernel/x86/zscal_sse2.S | 1 + 2 files changed, 2 insertions(+) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index e011c98f5..89e36251e 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -87,6 +87,7 @@ xorps %xmm7, %xmm7 comiss %xmm0, %xmm7 jne .L100 # Alpha_r != ZERO + jp .L100 # Alpha_r NaN comiss %xmm1, %xmm7 jne .L100 # Alpha_i != ZERO diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index cc7ab6686..0bc61b209 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -98,6 +98,7 @@ xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 + jp .L100 comisd %xmm1, %xmm7 jne .L100 From 1c31f56e5a23f6dd7ebfd35014fc5a604e4702cd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 8 Jan 2024 16:11:25 +0100 Subject: [PATCH 565/718] Handle NAN --- kernel/x86/zscal.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S index 1eb518563..adb56edb9 100644 --- a/kernel/x86/zscal.S +++ b/kernel/x86/zscal.S @@ -98,7 +98,7 @@ fcomip %st(1), %st ffreep %st(0) jne .L30 - +jp .L30 EMMS pxor %mm0, %mm0 From 058dd2a4cb4a6139796e29629191db5f2899fa1e Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 8 Jan 2024 14:16:09 -0600 Subject: [PATCH 566/718] Replace two vector loads with one vector pair load and fix endianess of stores - DGEMM versions. --- kernel/power/dgemm_tcopy_16_power8.S | 3 + kernel/power/dgemm_tcopy_macros_16_power10.S | 655 +++++++++++++++++++ kernel/power/dgemm_tcopy_macros_16_power8.S | 6 + 3 files changed, 664 insertions(+) create mode 100644 kernel/power/dgemm_tcopy_macros_16_power10.S diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index fc52e0202..7a3c04e8e 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#ifdef POWER10 +#include "dgemm_tcopy_macros_16_power10.S" +#endif #include "dgemm_tcopy_macros_16_power8.S" #define STACKSIZE 144 diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S new file mode 100644 index 000000000..3cb416e09 --- /dev/null +++ b/kernel/power/dgemm_tcopy_macros_16_power10.S @@ -0,0 +1,655 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x16', ` +#else +.macro COPY_4x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs48, o0, A2 + lxvpx vs50, o32, A2 + addi A2, A2, 64 + + lxvpx vs4, o0, A3 + lxvpx vs6, o32, A3 + addi A3, A3, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + lxvpx vs12, o0, A2 + lxvpx vs2, o32, A2 + addi A2, A2, 64 + + lxvpx vs8, o0, A3 + lxvpx vs10, o32, A3 + addi A3, A3, 64 + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs48, o0, T1 + stxvd2x vs49, o16, T1 + stxvd2x vs50, o32, T1 + stxvd2x vs51, o48, T1 +#else + stxvd2x vs49, o0, T1 + stxvd2x vs48, o16, T1 + stxvd2x vs51, o32, T1 + stxvd2x vs50, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs12, o0, T1 + stxvd2x vs13, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 +#else + stxvd2x vs13, o0, T1 + stxvd2x vs12, o16, T1 + stxvd2x vs3, o32, T1 + stxvd2x vs2, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs4, o0, T1 + stxvd2x vs5, o16, T1 + stxvd2x vs6, o32, T1 + stxvd2x vs7, o48, T1 +#else + stxvd2x vs5, o0, T1 + stxvd2x vs4, o16, T1 + stxvd2x vs7, o32, T1 + stxvd2x vs6, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 +#else + stxvd2x vs9, o0, T1 + stxvd2x vs8, o16, T1 + stxvd2x vs11, o32, T1 + stxvd2x vs10, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x8', ` +#else +.macro COPY_4x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o32, A1 + addi A1, A1, 64 + + + lxvpx vs40, o0, A2 + lxvpx vs42, o32, A2 + addi A2, A2, 64 + + + lxvpx vs44, o0, A3 + lxvpx vs46, o32, A3 + addi A3, A3, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_4x4', ` +#else +.macro COPY_4x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + lxvpx vs36, o0, A2 + addi A2, A2, 32 + + + lxvpx vs38, o0, A3 + addi A3, A3, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x16', ` +#else +.macro COPY_2x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o32, A0 + addi A0, A0, 64 + + + lxvpx vs40, o0, A1 + lxvpx vs42, o32, A1 + addi A1, A1, 64 + + lxvpx vs44, o0, A1 + lxvpx vs46, o32, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs40, o0, T1 + stxvd2x vs41, o16, T1 + stxvd2x vs42, o32, T1 + stxvd2x vs43, o48, T1 +#else + stxvd2x vs41, o0, T1 + stxvd2x vs40, o16, T1 + stxvd2x vs43, o32, T1 + stxvd2x vs42, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs44, o0, T1 + stxvd2x vs45, o16, T1 + stxvd2x vs46, o32, T1 + stxvd2x vs47, o48, T1 +#else + stxvd2x vs45, o0, T1 + stxvd2x vs44, o16, T1 + stxvd2x vs47, o32, T1 + stxvd2x vs46, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x8', ` +#else +.macro COPY_2x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + lxvpx vs36, o0, A1 + lxvpx vs38, o0, A1 + addi A1, A1, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_2x4', ` +#else +.macro COPY_2x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + lxvpx vs34, o0, A1 + addi A1, A1, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x16', ` +#else +.macro COPY_1x16 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + lxvpx vs36, o0, A0 + lxvpx vs38, o0, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + addi T1, T1, 64 + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs36, o0, T1 + stxvd2x vs37, o16, T1 + stxvd2x vs38, o32, T1 + stxvd2x vs39, o48, T1 +#else + stxvd2x vs37, o0, T1 + stxvd2x vs36, o16, T1 + stxvd2x vs39, o32, T1 + stxvd2x vs38, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x8', ` +#else +.macro COPY_1x8 +#endif + + lxvpx vs32, o0, A0 + lxvpx vs34, o32, A0 + addi A0, A0, 64 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 + stxvd2x vs34, o32, T1 + stxvd2x vs35, o48, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 + stxvd2x vs35, o32, T1 + stxvd2x vs34, o48, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +#if defined(_AIX) +define(`COPY_1x4', ` +#else +.macro COPY_1x4 +#endif + + lxvpx vs32, o0, A0 + addi A0, A0, 32 + + + mr T1, BO + +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxvd2x vs32, o0, T1 + stxvd2x vs33, o16, T1 +#else + stxvd2x vs33, o0, T1 + stxvd2x vs32, o16, T1 +#endif + +#if defined(_AIX) +') +#else +.endm +#endif + diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 6c5b8ed62..93b488fd0 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_4x16', ` #else @@ -275,6 +276,7 @@ define(`COPY_4x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -369,6 +371,7 @@ define(`COPY_4x1', ` * Macros for N=2 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_2x16', ` #else @@ -512,6 +515,7 @@ define(`COPY_2x4', ` #else .endm #endif +#endif /********************************************************************************************** @@ -580,6 +584,7 @@ define(`COPY_2x1', ` * Macros for N=1 and M=16 **********************************************************************************************/ +#ifndef POWER10 #if defined(_AIX) define(`COPY_1x16', ` #else @@ -675,6 +680,7 @@ define(`COPY_1x4', ` #else .endm #endif +#endif /********************************************************************************************** From 7df363e1e2daef0916faae7d0b399fc998e5433e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:08:52 +0100 Subject: [PATCH 567/718] temporarily disable the MSA C/ZSCAL kernels --- kernel/mips/KERNEL.P5600 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..f0fb5e087 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -103,8 +103,10 @@ endif ifdef HAVE_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c -CSCALKERNEL = ../mips/cscal_msa.c -ZSCALKERNEL = ../mips/zscal_msa.c +#CSCALKERNEL = ../mips/cscal_msa.c +#ZSCALKERNEL = ../mips/zscal_msa.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c else SSCALKERNEL = ../mips/scal.c DSCALKERNEL = ../mips/scal.c From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:10:56 +0100 Subject: [PATCH 568/718] Add compiler test and flag for AVX512BF16 capability --- c_check | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/c_check b/c_check index b5e4a9ad0..3e507be81 100755 --- a/c_check +++ b/c_check @@ -244,6 +244,7 @@ case "$data" in esac no_avx512=0 +no_avx512bf=0 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpf="$tmpd/a.c" @@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then } rm -rf "$tmpd" + if [ "$no_avx512" -eq 0 ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.c" + code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"' + printf "#include \n\nint main(void){ %s; }\n" "$code" >> "$tmpf" + if [ "$compiler" = "PGI" ]; then + args=" -tp cooperlake -c -o $tmpf.o $tmpf" + else + args=" -march=cooperlake -c -o $tmpf.o $tmpf" + fi + no_avx512bf=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_avx512bf=1 + } + + rm -rf "$tmpd" + fi fi no_rv64gv=0 @@ -409,6 +429,7 @@ done [ "$makefile" = "-" ] && { [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" exit 0 @@ -437,6 +458,7 @@ done [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" + [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 00:12:46 +0100 Subject: [PATCH 569/718] Make AVX512 BFLOAT16 kernels conditional on compiler capability --- kernel/x86_64/KERNEL.COOPERLAKE | 3 ++- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index dba94aea8..22b042029 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1,5 +1,5 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX - +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c @@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 3a832e917..0ab2b4ddc 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,5 +1,6 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE +ifneq ($(NO_AVX512BF16), 1) SBGEMM_SMALL_M_PERMIT = SBGEMM_SMALL_K_NN = SBGEMM_SMALL_K_B0_NN = @@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif From d1ead06bd8707723e643099aa1072aefa914f660 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 09:29:13 +0100 Subject: [PATCH 570/718] define NAN and INFINITY if needed --- utest/test_zscal.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index b6310439f..8992eee90 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -2,6 +2,13 @@ #include #ifdef BUILD_COMPLEX16 +#ifndef NAN +#define NAN 0.0/0.0 +#endif +#ifndef INFINITY +#define INFINITY 1.0/0.0 +#endif + CTEST(zscal, i_nan) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; From b57627c27f027692a4cc9725784e1b0e2e51bc49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 12:03:08 +0100 Subject: [PATCH 571/718] Handle NAN and INF --- kernel/zarch/zscal.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d39b8447e..d019d66df 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n1) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; + if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) + temp1 = NAN; + else temp1 = -da_i * x[i + 1 + inc_x]; x[i + 1 + inc_x] = da_i * x[i + inc_x]; x[i + inc_x] = temp1; @@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += inc_x; @@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { - temp0 = -da_i * x[i + 1]; + if (isnan(x[i]) || isinf(x[i])) + temp0 = NAN; + else + temp0 = -da_i * x[i + 1]; x[i + 1] = da_i * x[i]; x[i] = temp0; i += 2; From 20413ee6ecb05b4746e107ec61614178e498509b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Jan 2024 13:11:13 +0100 Subject: [PATCH 572/718] Update zscal.c --- kernel/zarch/zscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index d019d66df..4160a1a76 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -329,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else From 2e2e538b7c493b727ed9ff160e18f086f5e17384 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:02:43 +0100 Subject: [PATCH 573/718] Add openblas_set_num_threads_local() and use of blas_omp_threads_local in OMP parallel regions --- cblas.h | 1 + common_thread.h | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cblas.h b/cblas.h index 8a5055cf8..ade2fca3a 100644 --- a/cblas.h +++ b/cblas.h @@ -12,6 +12,7 @@ extern "C" { /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +int openblas_set_num_threads_local(int num_threads); /*Get the number of threads on runtime.*/ int openblas_get_num_threads(void); diff --git a/common_thread.h b/common_thread.h index 9e7dae74a..d37fcb189 100644 --- a/common_thread.h +++ b/common_thread.h @@ -137,19 +137,20 @@ typedef struct blas_queue { extern int blas_server_avail; extern int blas_omp_number_max; +extern int blas_omp_threads_local; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; openmp_nthreads=omp_get_max_threads(); + if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local; #endif #ifndef USE_OPENMP if (blas_cpu_number == 1 -#endif -#ifdef USE_OPENMP - if (openmp_nthreads == 1 || omp_in_parallel() +#else + if (openmp_nthreads == 1 #endif ) return 1; From 87d31af2ae8ef42c924d33633968af6262ab4344 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:06:24 +0100 Subject: [PATCH 574/718] Add openblas_set_num_threads_local() --- driver/others/blas_server_omp.c | 1 + driver/others/openblas_set_num_threads.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 213531057..4d3d1e0de 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,7 @@ int blas_server_avail = 0; int blas_omp_number_max = 0; +int blas_omp_threads_local = 1; extern int openblas_omp_adaptive_env(void); diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 0b57867b0..c938fde69 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef SMP_SERVER extern void openblas_set_num_threads(int num_threads) ; +extern int openblas_get_num_threads(void) ; void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } +int openblas_set_num_threads_local(int num_threads){ + int ret = openblas_get_num_threads(); + openblas_set_num_threads(num_threads); + blas_omp_threads_local=num_threads; + return ret; +} + + #else //Single thread @@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) { void openblas_set_num_threads_(int* num_threads){ } + +int openblas_set_num_threads_local(int num_threads){ + return 1; +} #endif From bf66af3dc0060af9a7b001470a5300efb8f8313b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:37:36 +0100 Subject: [PATCH 575/718] remove matrix dimension 6 from DGS to avoid spurious errors from FMA --- lapack-netlib/TESTING/dgd.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in index ca24dd2a0..127b63789 100644 --- a/lapack-netlib/TESTING/dgd.in +++ b/lapack-netlib/TESTING/dgd.in @@ -1,6 +1,6 @@ DGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits From 9fab60d32f595350d599d5fe901dfaae2ce56592 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jan 2024 20:39:05 +0100 Subject: [PATCH 576/718] Remove matrix dimension 6 from SGS to avoid spurious errors from FMA --- lapack-netlib/TESTING/sgd.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in index 164b3cc35..efab6c257 100644 --- a/lapack-netlib/TESTING/sgd.in +++ b/lapack-netlib/TESTING/sgd.in @@ -1,6 +1,6 @@ SGS Data for the Real Nonsymmetric Schur Form Driver 5 Number of matrix dimensions -2 6 10 12 20 30 Matrix dimensions +2 10 12 20 30 Matrix dimensions 1 1 1 2 1 Parameters NB, NBMIN, NXOVER, NS, NBCOL 10 Threshold for test ratios .TRUE. Put T to test the error exits From 8a9d492af7b26f6c94965301783f5895cae9cc3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jan 2024 19:58:49 +0100 Subject: [PATCH 577/718] Add default for blas_omp_threads_local --- driver/others/blas_server.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 2fcb37192..2531c57e9 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void); /* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; From 152a6c43b6c61607cc91d5285f94dbef2575ff4f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jan 2024 19:59:55 +0100 Subject: [PATCH 578/718] Add blas_omp_threads_local --- driver/others/blas_server_win32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 40ff85abc..2d41af228 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -59,6 +59,8 @@ static CRITICAL_SECTION queue_lock; /* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; +int blas_omp_threads_local = 1; + /* Local Variables */ static BLASULONG server_lock = 0; From 0d2e486edfb6919afea1bba62f5ac3ebc460cee0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 15 Jan 2024 11:18:59 +0100 Subject: [PATCH 579/718] Handle NAN and INF --- kernel/arm/zscal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index 0521aaa0b..b2d537d04 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F else { temp = - da_i * x[ip+1] ; + if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; x[ip+1] = da_i * x[ip] ; } } From 025a1b2c7b02c9984c331fde39eb27e492aeb8e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 15 Jan 2024 22:40:21 +0100 Subject: [PATCH 580/718] Only use mtune=native when not cross-compiling --- Makefile.arm64 | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 1b10446f7..ed52a9424 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -104,9 +104,15 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.4-a -mtune=native +FCOMMON_OPT += -march=armv8.4-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else @@ -138,9 +144,15 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve +ifneq ($(CROSS), 1) +CCOMMON_OPT += -mtune=native +endif ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.5-a -mtune=native +FCOMMON_OPT += -march=armv8.5-a +ifneq ($(CROSS), 1) +FCOMMON_OPT += -mtune=native +endif endif endif else From 8f4e325ea82f5d5f3791b147208cbaae8dde8f64 Mon Sep 17 00:00:00 2001 From: Ian McInerney Date: Mon, 15 Jan 2024 23:42:03 +0000 Subject: [PATCH 581/718] Fix Clang sapphire rapids march flag --- Makefile.x86_64 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 702447ace..c0dbe84c8 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC) endif endif else ifeq ($(C_COMPILER), CLANG) - # cooperlake support was added in clang 12 + # sapphire rapids support was added in clang 12 ifeq ($(CLANGVERSIONGTEQ12), 1) - CCOMMON_OPT += -march=cooperlake + CCOMMON_OPT += -march=sapphirerapids ifneq ($(F_COMPILER), NAG) - FCOMMON_OPT += -march=cooperlake + FCOMMON_OPT += -march=sapphirerapids endif else # not supported in clang, fallback to avx512 CCOMMON_OPT += -march=skylake-avx512 From 9edb805e645d3530e907864e242a3f64a881b28a Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Tue, 16 Jan 2024 14:24:18 +0000 Subject: [PATCH 582/718] fix builds with t-head toolchains that use old versions of the intrinsics spec --- common_riscv64.h | 17 +++++- cpuid_riscv64.c | 4 +- kernel/riscv64/amax_vector.c | 18 +++--- kernel/riscv64/amin_vector.c | 18 +++--- kernel/riscv64/asum_vector.c | 20 ++++--- kernel/riscv64/axpby_vector.c | 16 +++--- kernel/riscv64/axpy_vector.c | 12 ++-- kernel/riscv64/copy_vector.c | 14 ++--- kernel/riscv64/dot_vector.c | 44 +++++++++------ kernel/riscv64/gemv_n_vector.c | 24 ++++---- kernel/riscv64/gemv_t_vector.c | 42 ++++++++------ kernel/riscv64/iamax_vector.c | 80 +++++++++++++++------------ kernel/riscv64/iamin_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imax_vector.c | 82 ++++++++++++++++------------ kernel/riscv64/imin_vector.c | 80 ++++++++++++++++----------- kernel/riscv64/izamax_vector.c | 94 ++++++++++++++++++-------------- kernel/riscv64/izamin_vector.c | 92 ++++++++++++++++++------------- kernel/riscv64/max_vector.c | 20 ++++--- kernel/riscv64/min_vector.c | 20 ++++--- kernel/riscv64/nrm2_vector.c | 64 ++++++++++++---------- kernel/riscv64/rot_vector.c | 36 ++++++------ kernel/riscv64/scal_vector.c | 14 ++--- kernel/riscv64/sum_vector.c | 32 +++++------ kernel/riscv64/swap_vector.c | 10 ++-- kernel/riscv64/symv_L_vector.c | 56 +++++++++++-------- kernel/riscv64/symv_U_vector.c | 60 +++++++++++--------- kernel/riscv64/zamax_vector.c | 25 +++++---- kernel/riscv64/zamin_vector.c | 25 +++++---- kernel/riscv64/zasum_vector.c | 22 +++++--- kernel/riscv64/zaxpby_vector.c | 32 +++++------ kernel/riscv64/zaxpy_vector.c | 20 +++---- kernel/riscv64/zcopy_vector.c | 12 ++-- kernel/riscv64/zdot_vector.c | 60 +++++++++++--------- kernel/riscv64/zgemv_n_vector.c | 28 +++++----- kernel/riscv64/zgemv_t_vector.c | 56 +++++++++++-------- kernel/riscv64/zhemv_LM_vector.c | 60 +++++++++++--------- kernel/riscv64/zhemv_UV_vector.c | 60 +++++++++++--------- kernel/riscv64/znrm2_vector.c | 51 +++++++++-------- kernel/riscv64/zrot_vector.c | 36 ++++++------ kernel/riscv64/zscal_vector.c | 32 +++++------ kernel/riscv64/zsum_vector.c | 16 +++--- kernel/riscv64/zswap_vector.c | 10 ++-- 42 files changed, 900 insertions(+), 696 deletions(-) diff --git a/common_riscv64.h b/common_riscv64.h index de79c8cab..f11e8b75d 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,12 +91,23 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) # include +#endif + +#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) +// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this +#define RISCV_0p10_INTRINSICS +#define RISCV_RVV(x) x +#else +#define RISCV_RVV(x) __riscv_ ## x +#endif + +#if defined(C910V) || defined(RISCV64_ZVL256B) # if !defined(DOUBLE) -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) # else -# define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v) +# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) # endif #else # define EXTRACT_FLOAT(v) (v[0]) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 1b6b62f21..928b5ba92 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_GENERIC 0 #define CPU_C910V 1 -#define CPU_RISCV64_ZVL256B 2 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 static char *cpuname[] = { "RISCV64_GENERIC", "C910V", + "x280", "CPU_RISCV64_ZVL256B" }; diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 81a39af32..b66d4871e 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -49,15 +49,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index c8ba75f4a..1c541f0fd 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -48,15 +48,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index d10bf99e6..995dbf9a1 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -49,16 +49,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index b77cb58fb..386c4a5f1 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -48,15 +48,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMULVF_FLOAT JOIN(__riscv_vfmul, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 3447107a6..e99ca8542 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -49,13 +49,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) -#define VFMACCVF_FLOAT JOIN(__riscv_vfmacc, _vf_f, ELEN, LMUL, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index 710e8670a..ccbd6e482 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -47,12 +47,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) #else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index baf3d8f69..3b467a586 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index f3b922195..cd89c63ec 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,35 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 9977e2741..894c6a643 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 4301528bd..2dee5ab29 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -53,19 +53,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMAXVV_FLOAT JOIN(__riscv_vfmax, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 095b1c3df..df9a7a7e1 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -55,19 +55,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) +#endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt_vf_f, ELEN, LMUL, _b, MLEN) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f, ELEN, LMUL, _m) -#define VFMINVV_FLOAT JOIN(__riscv_vfmin, _vv_f, ELEN, LMUL, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 9136f0037..fca904d6a 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -53,17 +53,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFABS_FLOAT JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 404f51fb3..d5ad974cf 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 20bfe74ec..d19e51118 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 9da60acb0..9e4a67b71 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 57542714a..13b8fe378 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index f4acad770..104d3865d 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index 179454094..5d85ab3a4 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -93,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); j += inc_av; ix += inc_xv; @@ -117,8 +125,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl); - v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } temp_r = VFMVFS_FLOAT(v_res_r); diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index e025120e5..117db7d84 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 0e1ea5436..7c6b63bf3 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index 437bf4246..8614f7539 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -52,37 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VFABS JOIN(__riscv_vfabs, _v_f, ELEN, LMUL, _) -#define VMFNE JOIN(__riscv_vmfne_vf_f,ELEN, LMUL, _b, MLEN) -#define VMFGT JOIN(__riscv_vmfgt_vv_f,ELEN, LMUL, _b, MLEN) -#define VMFEQ JOIN(__riscv_vmfeq_vv_f,ELEN, LMUL, _b, MLEN) -#define VCPOP JOIN(__riscv_vcpop, _m_b, MLEN, _, _) -#define VFREDMAX JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) -#define VFIRST JOIN(__riscv_vfirst, _m_b, MLEN, _, _) -#define VRGATHER JOIN(__riscv_vrgather, _vx_f, ELEN, LMUL, _) -#define VFDIV JOIN(__riscv_vfdiv, _vf_f, ELEN, LMUL, _) -#define VFDIV_M JOIN(__riscv_vfdiv, _vv_f, ELEN, LMUL, _mu) -#define VFMUL JOIN(__riscv_vfmul, _vv_f, ELEN, LMUL, _) -#define VFMACC JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _) -#define VFMACC_M JOIN(__riscv_vfmacc, _vv_f, ELEN, LMUL, _mu) -#define VMSOF JOIN(__riscv_vmsof, _m_b, MLEN, _, _) -#define VMANDN JOIN(__riscv_vmandn, _mm_b, MLEN, _, _) -#define VFREDUSUM JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFABS JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VMFNE JOIN(RISCV_RVV(vmfne_vf_f),ELEN, LMUL, _b, MLEN) +#define VMFGT JOIN(RISCV_RVV(vmfgt_vv_f),ELEN, LMUL, _b, MLEN) +#define VMFEQ JOIN(RISCV_RVV(vmfeq_vv_f),ELEN, LMUL, _b, MLEN) +#define VCPOP JOIN(RISCV_RVV(vcpop), _m_b, MLEN, _, _) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAX(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFREDUSUM(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _m) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _m) +#else +#define VFREDMAX JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFREDUSUM JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL, JOIN2(_f, ELEN), m1) +#define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _mu) +#define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _mu) +#endif +#define VFIRST JOIN(RISCV_RVV(vfirst), _m_b, MLEN, _, _) +#define VRGATHER JOIN(RISCV_RVV(vrgather), _vx_f, ELEN, LMUL, _) +#define VFDIV JOIN(RISCV_RVV(vfdiv), _vf_f, ELEN, LMUL, _) +#define VFMUL JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _) +#define VFMACC JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _) +#define VMSOF JOIN(RISCV_RVV(vmsof), _m_b, MLEN, _, _) +#define VMANDN JOIN(RISCV_RVV(vmandn), _mm_b, MLEN, _, _) #if defined(DOUBLE) #define ABS fabs #else #define ABS fabsf #endif -#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v) +#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index c3afbc7cc..50751b343 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT __riscv_vle32_v_f32m4 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 5d9ab7b28..2034aafaa 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 -#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c index 7aab15105..ca0b02b5c 100644 --- a/kernel/riscv64/zsum_vector.c +++ b/kernel/riscv64/zsum_vector.c @@ -53,16 +53,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f, ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFMVVF_FLOAT JOIN(__riscv_vfmv, _v_f_f, ELEN, LMUL, _) -#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv, _v_f_f, ELEN, m1, _) -#define VFADDVV_FLOAT JOIN(__riscv_vfadd, _vv_f, ELEN, LMUL, _) -#define VMFLTVF_FLOAT JOIN(__riscv_vmflt, _vf_f, ELEN, LMUL, MLEN) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index d8980602d..02c98b588 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JOIN2(x, y) JOIN2_X(x, y) #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) -#define VSETVL JOIN(__riscv_vsetvl, _e, ELEN, LMUL, _) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) -#define VLEV_FLOAT JOIN(__riscv_vle, ELEN, _v_f, ELEN, LMUL) -#define VLSEV_FLOAT JOIN(__riscv_vlse, ELEN, _v_f, ELEN, LMUL) -#define VSEV_FLOAT JOIN(__riscv_vse, ELEN, _v_f, ELEN, LMUL) -#define VSSEV_FLOAT JOIN(__riscv_vsse, ELEN, _v_f, ELEN, LMUL) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { From ec89466e14d3dff51128682b3975f8554439a2f1 Mon Sep 17 00:00:00 2001 From: Dirreke Date: Tue, 16 Jan 2024 23:45:06 +0800 Subject: [PATCH 583/718] Add CSKY support --- CONTRIBUTORS.md | 5 +- Makefile.csky | 4 + Makefile.prebuild | 4 + Makefile.system | 5 ++ TargetList.txt | 4 + c_check | 2 + c_check.pl | 7 ++ common.h | 4 + common_csky.h | 56 ++++++++++++++ ctest.c | 4 + getarch.c | 34 ++++++++- kernel/csky/KERNEL | 149 +++++++++++++++++++++++++++++++++++++ kernel/csky/Makefile | 1 + lapack/laswp/csky/Makefile | 13 ++++ param.h | 37 +++++++++ 15 files changed, 325 insertions(+), 4 deletions(-) create mode 100644 Makefile.csky create mode 100644 common_csky.h create mode 100644 kernel/csky/KERNEL create mode 100644 kernel/csky/Makefile create mode 100644 lapack/laswp/csky/Makefile diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 493747052..419f90dab 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -218,4 +218,7 @@ In chronological order: * [2022-08] Fix building from sources for QNX * Mark Seminatore - * [2023-11-09] Improve Windows threading performance scaling \ No newline at end of file + * [2023-11-09] Improve Windows threading performance scaling + +* Dirreke + * [2024-01-16] Add basic support for the CSKY architecture diff --git a/Makefile.csky b/Makefile.csky new file mode 100644 index 000000000..36162af2a --- /dev/null +++ b/Makefile.csky @@ -0,0 +1,4 @@ +ifeq ($(CORE), CK860FV) +CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static +endif diff --git a/Makefile.prebuild b/Makefile.prebuild index 0be4f1274..83da8e2ce 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -55,6 +55,10 @@ ifeq ($(TARGET), C910V) TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d endif +ifeq ($(TARGET), CK860FV) +TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.system b/Makefile.system index e602eaf05..0088eaff5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -873,6 +873,11 @@ endif endif endif +ifeq ($(ARCH), csky) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings # diff --git a/TargetList.txt b/TargetList.txt index deef75819..c11b94fa5 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -133,3 +133,7 @@ E2K EV4 EV5 EV6 + +14.CSKY +CSKY +CK860FV diff --git a/c_check b/c_check index 3e507be81..59ab9bb13 100755 --- a/c_check +++ b/c_check @@ -91,6 +91,7 @@ case "$data" in *ARCH_ZARCH*) architecture=zarch ;; *ARCH_RISCV64*) architecture=riscv64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac defined=0 @@ -236,6 +237,7 @@ case "$data" in *ARCH_ARM*) architecture=arm ;; *ARCH_ZARCH*) architecture=zarch ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;; + *ARCH_CSKY*) architecture=csky ;; esac binformat='bin32' diff --git a/c_check.pl b/c_check.pl index d9c36793c..6b89f06eb 100644 --- a/c_check.pl +++ b/c_check.pl @@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $defined = 0; @@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") { $binary = 64; } +if ($architecture eq "csky") { + $defined = 1; + $binary = 32; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); +$architecture = csky if ($data =~ /ARCH_CSKY/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index 462c1d428..1f0b9e533 100644 --- a/common.h +++ b/common.h @@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_e2k.h" #endif +#ifdef ARCH_CSKY +#include "common_csky.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_csky.h b/common_csky.h new file mode 100644 index 000000000..3095dc781 --- /dev/null +++ b/common_csky.h @@ -0,0 +1,56 @@ +/***************************************************************************** +Copyright (c) 2011-2015, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#ifndef COMMON_CSKY +#define COMMON_CSKY + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif \ No newline at end of file diff --git a/ctest.c b/ctest.c index 2ccae8dcc..cbc15326f 100644 --- a/ctest.c +++ b/ctest.c @@ -173,6 +173,10 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__csky__) +ARCH_CSKY +#endif + #if defined(__EMSCRIPTEN__) ARCH_RISCV64 OS_WINDOWS diff --git a/getarch.c b/getarch.c index 87384c084..2d26da079 100644 --- a/getarch.c +++ b/getarch.c @@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_EV4 */ /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ +/* #define FORCE_CSKY */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -1692,6 +1694,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "generic" #endif +#ifdef FORCE_CSKY +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CSKY" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCSKY" \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "csky" +#define CORENAME "CSKY" +#endif + +#ifdef FORCE_CK860FV +#define FORCE +#define ARCHITECTURE "CSKY" +#define SUBARCHITECTURE "CK860V" +#define SUBDIRNAME "csky" +#define ARCHCONFIG "-DCK860FV " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ck860fv" +#define CORENAME "CK860FV" +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1766,7 +1795,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif - #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." #endif @@ -1831,7 +1859,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1979,7 +2007,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/csky/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/csky/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/csky/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 469c38ce3..e048dabe7 100644 --- a/param.h +++ b/param.h @@ -3807,7 +3807,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SYMV_P 16 #endif +#if defined(CSKY) || defined(CK860FV) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef GENERIC From ec46ca7a4344dac05f16ce29e3be86368f595d0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Jan 2024 07:33:10 +0100 Subject: [PATCH 584/718] Support Arm Compiler for Linux as classic flang (#4436) * Support ArmCompilerforLinux as classic flang --- f_check | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index dac34edee..81f598ffa 100755 --- a/f_check +++ b/f_check @@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then pathf90 pathf95 pgf95 pgf90 pgf77 pgfortran nvfortran flang egfortran - ifort nagfor ifx ftn crayftn" + ifort nagfor ifx ftn crayftn armflang" for list in $lists; do for p in $path; do @@ -85,7 +85,11 @@ else *Hewlett*) vendor=CRAY openmp='-fopenmp' - ;; + ;; + *Arm\ F90*) + vendor=FLANG + openmp='-fopenmp' + ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -108,7 +112,7 @@ else if [ "$major" -ge 17 ]; then vendor=FLANGNEW fi - ;; + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Jan 2024 20:59:24 +0100 Subject: [PATCH 585/718] Increase multithreading threshold by a factor of 50 --- interface/gemv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemv.c b/interface/gemv.c index 1f0763579..2c121f130 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) + if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) nthreads = 1; else nthreads = num_cpu_avail(2); From a4992e09bc75cf95ea829c8ce3a8e2ea09d3d1fc Mon Sep 17 00:00:00 2001 From: Pierrick Bouvier Date: Thu, 18 Jan 2024 18:20:37 +0400 Subject: [PATCH 586/718] Fix utest compilation Introduced recently when adding new test cases for ZSCAL - include cblas is needed for cblas_zscal - ASSERT macro does not exist - missing closing ) --- utest/utest_main2.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/utest/utest_main2.c b/utest/utest_main2.c index 8cb663190..46a3b067d 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CTEST_SEGFAULT #define CTEST_ADD_TESTS_MANUALLY +#include "cblas.h" #include "openblas_utest.h" CTEST(amax, samax){ @@ -623,10 +624,10 @@ CTEST(zscal, i_nan) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; cblas_zscal(9, i, &nan, 1); - ASSERT(isnan(nan[0]); - ASSERT(isnan(nan[1]); - ASSERT(isnan(nan[16]); - ASSERT(isnan(nan[17]); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); } CTEST(zscal, nan_i) @@ -634,10 +635,10 @@ CTEST(zscal, nan_i) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; cblas_zscal(9, &nan, &i, 1); - ASSERT(isnan(i[0]); - ASSERT(isnan(i[1]); - ASSERT(isnan(i[16]); - ASSERT(isnan(i[17]); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); } CTEST(zscal, i_inf) @@ -645,10 +646,10 @@ CTEST(zscal, i_inf) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; cblas_zscal(9, i, &inf, 1); - ASSERT(isnan(inf[0]); - ASSERT(isinf(inf[1]); - ASSERT(isnan(inf[16]); - ASSERT(isinf(inf[17]); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); } CTEST(zscal, inf_i) @@ -656,10 +657,10 @@ CTEST(zscal, inf_i) double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; cblas_zscal(9, &inf, &i, 1); - ASSERT(isnan(i[0]); - ASSERT(isinf(i[1]); - ASSERT(isnan(i[16]); - ASSERT(isinf(i[17]); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); } int main(int argc, const char ** argv){ From b193ea3d7b6ae9950d6c7c73937661318a719c26 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 18 Jan 2024 22:11:12 +0300 Subject: [PATCH 587/718] Fix BLAS and LAPACK tests for RVV 1.0 target, update to 0.12.0 intrincics * Update intrincics API to 0.12.0 version (Stride Segment Loads/Stores) * Fixed nrm2, axpby, ncopy, zgemv and scal kernels * Added zero size checks --- kernel/riscv64/axpby_rvv.c | 8 +- kernel/riscv64/copy_rvv.c | 2 +- kernel/riscv64/gemm_ncopy_8_rvv.c | 51 ++++- kernel/riscv64/gemm_tcopy_8_rvv.c | 71 +++--- kernel/riscv64/izamax_rvv.c | 23 +- kernel/riscv64/izamin_rvv.c | 23 +- kernel/riscv64/nrm2_rvv.c | 237 +++++++++++++------ kernel/riscv64/scal_rvv.c | 49 ++-- kernel/riscv64/symv_U_rvv.c | 2 +- kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 39 ++-- kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 37 +-- kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 33 ++- kernel/riscv64/zamax_rvv.c | 23 +- kernel/riscv64/zamin_rvv.c | 23 +- kernel/riscv64/zaxpby_rvv.c | 63 ++++-- kernel/riscv64/zaxpy_rvv.c | 76 +++++-- kernel/riscv64/zcopy_rvv.c | 40 ++-- kernel/riscv64/zdot_rvv.c | 49 +++- kernel/riscv64/zgemm_beta_rvv.c | 27 ++- kernel/riscv64/zgemm_ncopy_4_rvv.c | 72 ++++-- kernel/riscv64/zgemm_ncopy_rvv_v1.c | 18 +- kernel/riscv64/zgemm_tcopy_4_rvv.c | 60 +++-- kernel/riscv64/zgemm_tcopy_rvv_v1.c | 18 +- kernel/riscv64/zgemmkernel_rvv_v1x4.c | 144 +++++++++--- kernel/riscv64/zgemv_n_rvv.c | 50 ++-- kernel/riscv64/zgemv_t_rvv.c | 44 ++-- kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 33 ++- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 33 ++- kernel/riscv64/znrm2_rvv.c | 301 +++++++++++++++++++------ kernel/riscv64/zrot_rvv.c | 95 ++++++-- kernel/riscv64/zscal_rvv.c | 66 ++++-- kernel/riscv64/zsum_rvv.c | 23 +- kernel/riscv64/zswap_rvv.c | 62 ++--- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 33 ++- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 33 ++- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_ltcopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_uncopy_rvv_v1.c | 30 ++- kernel/riscv64/ztrmm_utcopy_rvv_v1.c | 31 ++- kernel/riscv64/ztrmmkernel_rvv_v1x4.c | 110 ++++++--- kernel/riscv64/ztrsm_lncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_ltcopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_uncopy_rvv_v1.c | 26 +-- kernel/riscv64/ztrsm_utcopy_rvv_v1.c | 26 +-- param.h | 2 +- 46 files changed, 1628 insertions(+), 709 deletions(-) diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c index a1dbdb0e4..d7fb86eab 100644 --- a/kernel/riscv64/axpby_rvv.c +++ b/kernel/riscv64/axpby_rvv.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * { FLOAT_V_T vx, vy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( beta == 0.0 ) { if ( alpha == 0.0 ) { @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_y = inc_y * sizeof(FLOAT); size_t vl = VSETVL(n); vy = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) { + for ( ; n > 0; n -= vl, y += vl*inc_y) { vl = VSETVL(n); VSSEV_FLOAT(y, stride_y, vy, vl); } @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } else { if ((1 == inc_x) && (1 == inc_y)) { - for (size_t vl; n > 0; n -= vl, y += vl) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); vy = VLEV_FLOAT(y, vl); vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); VSEV_FLOAT (y, vy, vl); } } else if (1 == inc_x) { diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c index 041fd2dae..9d4b84095 100644 --- a/kernel/riscv64/copy_rvv.c +++ b/kernel/riscv64/copy_rvv.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); FLOAT_V_T v0; diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c index 3030d67fb..c652ab0c0 100644 --- a/kernel/riscv64/gemm_ncopy_8_rvv.c +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/gemm_ncopy_8.c @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) FLOAT *b_offset; FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v7 = VLEV_FLOAT(a_offset7, vl); v8 = VLEV_FLOAT(a_offset8, vl); - VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl); + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); a_offset1 += vl; a_offset2 += vl; @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v3 = VLEV_FLOAT(a_offset3, vl); v4 = VLEV_FLOAT(a_offset4, vl); - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); a_offset1 += vl; a_offset2 += vl; @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) v1 = VLEV_FLOAT(a_offset1, vl); v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += vl; a_offset2 += vl; diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 080a87312..4742ae6a7 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 8; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 8; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 32; @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 8; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 16; @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) for(i = (n >> 3); i > 0; i--) { size_t vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 2) { size_t vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c index e93f0056c..32f66a7a7 100644 --- a/kernel/riscv64/izamax_rvv.c +++ b/kernel/riscv64/izamax_rvv.c @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(max_index); FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; UINT_V_T v_max_index; MASK_T mask; @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c index b5bc27404..d34b220fa 100644 --- a/kernel/riscv64/izamin_rvv.c +++ b/kernel/riscv64/izamin_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define MASK_T vbool16_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 @@ -60,8 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define MASK_T vbool8_t #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 @@ -90,6 +94,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(min_index); FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; UINT_V_T v_min_index; MASK_T mask; @@ -104,7 +109,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); @@ -127,7 +135,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vx0 = VFABSV_FLOAT(vx0, vl); vx1 = VFABSV_FLOAT(vx1, vl); diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 994fadb70..3eb423849 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -26,78 +26,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m8() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT __riscv_vle32_v_f32m8 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define ABS fabsf -#else -#define VSETVL(n) __riscv_vsetvl_e64m8(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m8() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m8 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 + +#if defined(DOUBLE) +#define VSETVL __riscv_vsetvl_e64m4 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define MASK_T vbool16_t +#define VFABS __riscv_vfabs_v_f64m4 +#define VMFNE __riscv_vmfne_vf_f64m4_b16 +#define VMFGT __riscv_vmfgt_vv_f64m4_b16 +#define VMFEQ __riscv_vmfeq_vf_f64m4_b16 +#define VCPOP __riscv_vcpop_m_b16 +#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 +#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 +#define VFIRST __riscv_vfirst_m_b16 +#define VRGATHER __riscv_vrgather_vx_f64m4 +#define VFDIV __riscv_vfdiv_vv_f64m4 +#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu +#define VFMUL __riscv_vfmul_vv_f64m4 +#define VFMUL_M __riscv_vfmul_vv_f64m4_mu +#define VFMACC __riscv_vfmacc_vv_f64m4 +#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu +#define VMSBF __riscv_vmsbf_m_b16 +#define VMSOF __riscv_vmsof_m_b16 +#define VMAND __riscv_vmand_mm_b16 +#define VMANDN __riscv_vmand_mm_b16 +#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 +#define VMERGE __riscv_vmerge_vvm_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) #define ABS fabs +#else +#define VSETVL __riscv_vsetvl_e32m4 +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define MASK_T vbool8_t +#define VFABS __riscv_vfabs_v_f32m4 +#define VMFNE __riscv_vmfne_vf_f32m4_b8 +#define VMFGT __riscv_vmfgt_vv_f32m4_b8 +#define VMFEQ __riscv_vmfeq_vf_f32m4_b8 +#define VCPOP __riscv_vcpop_m_b8 +#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 +#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 +#define VFIRST __riscv_vfirst_m_b8 +#define VRGATHER __riscv_vrgather_vx_f32m4 +#define VFDIV __riscv_vfdiv_vv_f32m4 +#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu +#define VFMUL __riscv_vfmul_vv_f32m4 +#define VFMUL_M __riscv_vfmul_vv_f32m4_mu +#define VFMACC __riscv_vfmacc_vv_f32m4 +#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu +#define VMSBF __riscv_vmsbf_m_b8 +#define VMSOF __riscv_vmsof_m_b8 +#define VMAND __riscv_vmand_mm_b8 +#define VMANDN __riscv_vmand_mm_b8 +#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 +#define VMERGE __riscv_vmerge_vvm_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) +#define ABS fabsf #endif - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - - if( n <= 0 ) return(0.0); - if(n == 1) return (ABS(x[0])); - - FLOAT_V_T vr, v0; - FLOAT_V_T_M1 v_res; - FLOAT ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - - vr = VFMVVF_FLOAT(0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + BLASLONG i=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + if(n == 1) return (ABS(x[0])); + + unsigned int gvl = 0; + + MASK_T nonzero_mask; + MASK_T scale_mask; + + gvl = VSETVL(n); + FLOAT_V_T v0; + FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); + FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); + + FLOAT scale = 0; + FLOAT ssq = 0; + unsigned int stride_x = inc_x * sizeof(FLOAT); + int idx = 0; + + if( n >= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl * inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //finish any tail using scalar ops + i*=gvl*inc_x; + n*=inc_x; + while(i < n){ + if ( x[i] != 0.0 ){ + FLOAT absxi = ABS( x[i] ); + if ( scale < absxi ){ + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else{ + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + + i += inc_x; } - } - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); + return(scale * sqrt(ssq)); +} - ssq = VFMVFS_FLOAT_M1(v_res); - return sqrt(ssq); -} diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c index 2e2cfd31e..2c273fb63 100644 --- a/kernel/riscv64/scal_rvv.c +++ b/kernel/riscv64/scal_rvv.c @@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() #define FLOAT_V_T vfloat32m8_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 @@ -38,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() #define FLOAT_V_T vfloat64m8_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 @@ -54,26 +56,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T v0; if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl) { - vl = VSETVL(n); - - v0 = VLEV_FLOAT(x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSEV_FLOAT(x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } } - - } else { + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); - for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { - vl = VSETVL(n); - - v0 = VLSEV_FLOAT(x, stride_x, vl); - v0 = VFMULVF_FLOAT(v0, da, vl); - VSSEV_FLOAT(x, stride_x, v0, vl); + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } } - } return 0; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c index 3cfd3ee4c..bcd2f6981 100644 --- a/kernel/riscv64/symv_U_rvv.c +++ b/kernel/riscv64/symv_U_rvv.c @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; - + BLASLONG m1 = m - offset; if(inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c index 886af0c3b..869561fb3 100644 --- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -140,6 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; a += (m - 1) * m * 2; @@ -153,7 +158,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -165,12 +172,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = 0; k < i; k ++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -182,7 +193,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c index ddeef966c..da443cfba 100644 --- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -137,6 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + FLOAT_VX2_T vbx2, vsx2, vcx2; FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; size_t vl; @@ -149,7 +154,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = n; j > 0; j -= vl) { vl = VSETVL(j); - VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(vb1, aa1, vl); vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); @@ -161,12 +168,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(vb2, aa1, vl); vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); #endif - VSSEG2_FLOAT(b, vs1, vs2, vl); - VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); b += vl * 2; for (k = i + 1; k < m; k++) { - VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl); + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); @@ -178,7 +189,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); #endif - VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); } pc += vl * ldc * 2; } diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c index 4c83bbaa3..32e481036 100644 --- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -31,13 +31,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSSEV_FLOAT __riscv_vsse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -45,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSSEV_FLOAT __riscv_vsse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +135,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -147,7 +150,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -159,13 +164,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = i + 1; k < n; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -177,7 +186,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c index b368eefb9..81cc41818 100644 --- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 @@ -42,10 +45,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -133,6 +139,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B int i, j, k; + FLOAT_VX2_T vax2, vsx2, vcx2; FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; size_t vl; @@ -149,7 +156,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B pcj = c; for (j = m; j > 0; j -= vl) { vl = VSETVL(j); - VLSEG2_FLOAT(&va1, &va2, pci, vl); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); #ifndef CONJ vs1 = VFMULVF_FLOAT(va1, bb1, vl); vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); @@ -161,13 +170,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vs2 = VFMULVF_FLOAT(va2, bb1, vl); vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); #endif - VSSEG2_FLOAT(a, vs1, vs2, vl); - VSSEG2_FLOAT(pci, vs1, vs2, vl); + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); a += vl * 2; pci += vl * 2; for (k = 0; k < i; k ++){ - VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl); + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); #ifndef CONJ vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); @@ -179,7 +192,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); #endif - VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl); + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); } pcj += vl * 2; } diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c index bbb1e876b..180cf059a 100644 --- a/kernel/riscv64/zamax_rvv.c +++ b/kernel/riscv64/zamax_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmax; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -95,7 +103,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c index c5453121b..56a467502 100644 --- a/kernel/riscv64/zamin_rvv.c +++ b/kernel/riscv64/zamin_rvv.c @@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T v0, v1, vmin; FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); size_t vlmax = VSETVL_MAX; @@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); @@ -94,7 +102,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v0 = VFABSV_FLOAT(v0, vl); v1 = VFABSV_FLOAT(v1, vl); diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c index e0da55311..66e38c1e4 100644 --- a/kernel/riscv64/zaxpby_rvv.c +++ b/kernel/riscv64/zaxpby_rvv.c @@ -35,6 +35,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 @@ -42,13 +45,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 @@ -56,10 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) @@ -74,6 +80,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG stride_x = inc_x2 * sizeof(FLOAT); BLASLONG stride_y = inc_y2 * sizeof(FLOAT); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if ( beta_r == 0.0 && beta_i == 0.0) { @@ -81,10 +88,12 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL { size_t vl = VSETVL(n); FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl); - for ( ; n > 0; n -= vl, y += vl*stride_y) + vxx2 = VSET_VX2(vxx2, 0, temp); + vxx2 = VSET_VX2(vxx2, 1, temp); + for ( ; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VSSSEG_FLOAT(y, stride_y, temp, temp, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); } } else @@ -92,7 +101,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); @@ -100,20 +112,26 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } } else { FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; if ( alpha_r == 0.0 && alpha_i == 0.0 ) { for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vy1, beta_i, vl); v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); @@ -121,7 +139,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMULVF_FLOAT(vy1, beta_r, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } else @@ -129,8 +149,14 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); @@ -142,7 +168,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); - VSSSEG_FLOAT(y, stride_y, v0, v1, vl); + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); } } } diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c index 3f75898e0..0db32df10 100644 --- a/kernel/riscv64/zaxpy_rvv.c +++ b/kernel/riscv64/zaxpy_rvv.c @@ -30,19 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -53,14 +59,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if(da_r == 0.0 && da_i == 0.0) return(0); FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if(inc_x == 1 && inc_y == 1) { for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); @@ -72,7 +85,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1) { @@ -82,8 +97,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -96,7 +116,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1) { @@ -106,8 +128,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -120,7 +147,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSEG_FLOAT(y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -131,8 +160,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); #if !defined(CONJ) vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); @@ -145,7 +179,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); #endif - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c index bd94810ce..13879f03b 100644 --- a/kernel/riscv64/zcopy_rvv.c +++ b/kernel/riscv64/zcopy_rvv.c @@ -34,11 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 #define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T_M4 vfloat32m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T_M8 vfloat64m8_t @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 #define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T_M4 vfloat64m4_t -#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - if(n < 0) return(0); + if(n <= 0) return(0); if(inc_x == 1 && inc_y == 1) { @@ -70,34 +70,34 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) }else if (1 == inc_x) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSEG_FLOAT_M4(&vr, &vi, x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } else if (1 == inc_y) { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSEG_FLOAT_M4(y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, vx2, vl); } } else { - FLOAT_V_T_M4 vr, vi; + FLOAT_VX2_T_M4 vx2; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL_M4(n); - VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl); - VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); } } diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c index fa0e89353..13bc2ee39 100644 --- a/kernel/riscv64/zdot_rvv.c +++ b/kernel/riscv64/zdot_rvv.c @@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -71,6 +75,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; size_t vlmax_m1 = VSETVL_MAX_M1; v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); @@ -83,8 +88,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -104,8 +114,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -124,8 +139,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); @@ -145,8 +165,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c index b94b5f4bf..ee334801b 100644 --- a/kernel/riscv64/zgemm_beta_rvv.c +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -41,8 +41,11 @@ #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 @@ -50,8 +53,11 @@ #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 @@ -68,6 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT *c_offset; size_t vl; FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; ldc *= 2; c_offset = c; @@ -77,6 +84,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vl = VSETVL(m); vr = VFMVVF_FLOAT(0.0, vl); vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); for( ; n > 0; n--, c += ldc) { c_offset = c; @@ -84,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VSSEG_FLOAT(c_offset, vr, vi, vl); + VSSEG_FLOAT(c_offset, vx2, vl); } } @@ -96,7 +105,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { vl = VSETVL(chunk); - VLSEG_FLOAT(&vr, &vi, c_offset, vl); + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); v1 = VFMULVF_FLOAT(vr, beta_r, vl); v2 = VFMULVF_FLOAT(vi, beta_i, vl); @@ -107,7 +118,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, vr = VFSUBVV_FLOAT(v1, v2, vl); vi = VFADDVV_FLOAT(v3, v4, vl); - VSSEG_FLOAT(c_offset, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); } } diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c index d50a4b8d5..dce98752e 100644 --- a/kernel/riscv64/zgemm_ncopy_4_rvv.c +++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c @@ -29,18 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VGET_VX2 __riscv_vget_v_f32m1x2_f32m1 +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VGET_VX2 __riscv_vget_v_f64m1x2_f64m1 +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif // Optimizes the implementation in ../generic/zgemm_ncopy_4.c @@ -53,7 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ FLOAT *boffset; - FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42; + FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2; + FLOAT_VX4_T vxx4; + FLOAT_VX8_T vxx8; size_t vl; aoffset = a; @@ -69,12 +83,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); - VLSEG2_FLOAT(&v31, &v32, aoffset3, vl); - VLSEG2_FLOAT(&v41, &v42, aoffset4, vl); - - VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -91,10 +114,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); - VLSEG2_FLOAT(&v21, &v22, aoffset2, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); - VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl); + VSSEG4_FLOAT(boffset, vxx4, vl); aoffset1 += vl * 2; aoffset2 += vl * 2; @@ -108,9 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for (i = m; i > 0; i -= vl) { vl = VSETVL(i); - VLSEG2_FLOAT(&v11, &v12, aoffset1, vl); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); - VSSEG2_FLOAT(boffset, v11, v12, vl); + VSSEG2_FLOAT(boffset, v1x2, vl); aoffset1 += vl * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c index 1d3b8d3b7..275daa5f2 100644 --- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -30,14 +30,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ @@ -48,7 +48,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ FLOAT *a_offset1; FLOAT *b_offset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a_offset += vl * lda * 2; for(i = m; i > 0; i--) { - VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(b_offset, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); a_offset1 += 2; b_offset += vl * 2; diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index 8c35b5616..cfafbf0dc 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -30,25 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m1(n) #define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t #define VLEV_FLOAT __riscv_vle32_v_f32m1 #define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 #else #define VSETVL(n) __riscv_vsetvl_e64m1(n) #define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t #define VLEV_FLOAT __riscv_vle64_v_f64m1 #define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ @@ -60,7 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ IFLOAT *boffset, *boffset1, *boffset2, *boffset3; - FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7; + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + size_t vl; //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -81,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 4; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -91,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 4; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 16; @@ -101,8 +111,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 4; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); aoffset1 += 2; boffset3 += 8; @@ -119,8 +129,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ for(i = (n >> 2); i > 0; i--) { vl = 2; - VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl); + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); aoffset1 += 8; boffset1 += m * 8; @@ -129,8 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 2; - VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl); + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); aoffset1 += 4; boffset2 += 8; @@ -139,8 +149,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1) { vl = 2; - VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, v0, v1, vl); + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); //aoffset1 += 2; boffset3 += 4; diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c index 7a085269c..96e986502 100644 --- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -29,14 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -47,7 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *aoffset1; IFLOAT *boffset; - FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vl; //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); @@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) aoffset += vl * 2; for(i = m; i > 0; i--) { - VLSEG2_FLOAT(&v0, &v1, aoffset1, vl); - VSSEG2_FLOAT(boffset, v0, v1, vl); + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); aoffset1 += lda * 2; boffset += vl * 2; diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c index 41399cf79..77e012ff5 100644 --- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -30,20 +30,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -80,6 +86,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b BLASLONG i,j,k; FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -109,10 +116,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -137,7 +148,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -162,7 +175,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -211,7 +226,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -237,35 +254,57 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); - VLSEG2_FLOAT(&va0, &va1, C2, vl); - VLSEG2_FLOAT(&va2, &va3, C3, vl); + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -294,9 +333,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -311,7 +355,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -326,7 +372,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -356,7 +404,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -372,20 +422,31 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); - VLSEG2_FLOAT(&va2, &va3, C1, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); C0 += vl * 2; C1 += vl * 2; @@ -409,9 +470,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = bk/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -420,7 +486,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -430,7 +498,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -448,7 +518,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = (bk & 3); k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -458,12 +530,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; } - VLSEG2_FLOAT(&va0, &va1, C0, vl); + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); C0 += vl * 2; } diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c index 4a40c30a7..f14ef5ba8 100644 --- a/kernel/riscv64/zgemv_n_rvv.c +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 #endif @@ -62,6 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a_ptr; FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -73,7 +80,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -84,7 +94,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -113,7 +126,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSEG_FLOAT(y, vy0, vy1, vl); + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); } } @@ -123,7 +139,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vl = VSETVL(m); a_ptr = a; ix = 0; - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); for(i = 0; i < n; i++){ #if !defined(XCONJ) @@ -134,7 +152,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; #endif - VLSEG_FLOAT(&va0, &va1, a_ptr, vl); + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); #if !defined(CONJ) #if !defined(XCONJ) vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); @@ -163,7 +183,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; ix += inc_x2; } - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } return(0); diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c index 2f0380530..1c89a9f72 100644 --- a/kernel/riscv64/zgemv_t_rvv.c +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -32,9 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 @@ -46,9 +48,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu #define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 @@ -66,6 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT_V_T va0, va1, vx0, vx1, vr, vi; FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; //BLASLONG stride_a = sizeof(FLOAT) * 2; @@ -73,6 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; size_t vlmax = VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); v_z0 = VFMVVF_FLOAT_M1(0, vlmax); vlmax = VSETVL(m); @@ -86,8 +92,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -104,9 +115,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) @@ -130,8 +141,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, for(size_t vl, k = m; k > 0; k -= vl) { vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl); - VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl); + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); @@ -148,9 +164,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, ix += vl * inc_x * 2; } - v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); temp_r = VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); temp_i = VFMVFS_FLOAT_M1(v_res); #if !defined(XCONJ) diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 79b20a646..97013895a 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -81,6 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -96,8 +103,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posY * 2 + posX * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -111,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index a86815275..59029e9e5 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -79,6 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool_gt0, vbool_eq0; INT_V_T vindex_max, vindex; @@ -94,8 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); @@ -108,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c index d2b27aa8d..32f67758a 100644 --- a/kernel/riscv64/znrm2_rvv.c +++ b/kernel/riscv64/znrm2_rvv.c @@ -28,95 +28,248 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e32m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 -#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #else -#define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define VSETVL_MAX __riscv_vsetvlmax_e64m4() -#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu -#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 -#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 #endif -// TODO: Should single precision use the widening MAC, or perhaps all should be double? - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i=0, j=0; - if ( n <= 0 ) return(0.0); - - FLOAT_V_T vr, v0, v1; - FLOAT_V_T_M1 v_max, v_res; - FLOAT scale = 0.0, ssq = 0.0; - - size_t vlmax = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, vlmax); - v_max = VFMVVF_FLOAT_M1(0, vlmax); + if (n <= 0 || inc_x <= 0) return(0.0); - vr = VFMVVF_FLOAT(0, vlmax); + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; if (inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - VLSEG_FLOAT(&v0, &v1, x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); - - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); - v0 = VFABSV_FLOAT(v0, vl); - v1 = VFABSV_FLOAT(v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + //ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + idx += inc_v; + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); - v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl); + //tail + if (j < n) { + gvl = VSETVL(n-j); + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if(index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + } else { // found greater element + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } - v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl); - vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl); + v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + //fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + //if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) {//no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } else { // found greater element + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + //find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + //update ssq before max_index + ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); + //update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + //ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + //total ssq now + ssq += VFMVFS_FLOAT(v_res); } - } - - v_res = VFREDSUM_FLOAT(vr, v_res, vlmax); - - ssq = VFMVFS_FLOAT_M1(v_res); - scale = VFMVFS_FLOAT_M1(v_max); - ssq = ssq / (scale*scale); - - return(scale * sqrt(ssq)); + return(scale * sqrt(ssq)); } diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c index ee81bfe91..1d5390684 100644 --- a/kernel/riscv64/zrot_rvv.c +++ b/kernel/riscv64/zrot_rvv.c @@ -30,28 +30,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 #define VSEV_FLOAT __riscv_vse32_v_f32m4 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4 -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 #define VLEV_FLOAT __riscv_vle64_v_f64m4 #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 #define VSEV_FLOAT __riscv_vse64_v_f64m4 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4 -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -63,6 +69,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if (n <= 0) return(0); FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2, vtx2; if (inc_x == 0 && inc_y == 0) { BLASLONG i=0; @@ -93,8 +100,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -105,8 +117,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else if (inc_x == 1){ @@ -115,8 +132,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -127,8 +149,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSEG_FLOAT(x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } else if (inc_y == 1){ @@ -137,8 +164,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -149,8 +181,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSEG_FLOAT(y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); } } else { @@ -160,8 +197,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); vt0 = VFMULVF_FLOAT(vx0, c, vl); vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); @@ -172,8 +214,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT vy1 = VFMULVF_FLOAT(vy1, c, vl); vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl); - VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl); + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); } } diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 779fab68c..2586c6036 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 @@ -43,10 +46,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m4(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 @@ -61,6 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_V_T vt, vr, vi; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; if(da_r == 0.0 && da_i == 0.0) { @@ -71,16 +78,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - VSSEG_FLOAT(x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } @@ -89,12 +98,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vi, -da_i, vl); vi = VFMULVF_FLOAT(vr, da_i, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else if(da_i == 0.0) { @@ -102,12 +116,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vr = VFMULVF_FLOAT(vr, da_r, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); - VSSSEG_FLOAT(x, stride_x, vr, vi, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } else { @@ -117,14 +135,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vr, &vi, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSEG_FLOAT(x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); } } else { @@ -132,14 +154,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - VSSSEG_FLOAT(x, stride_x, vt, vi, vl); + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); } } } diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c index b41f70eb5..489188bd5 100644 --- a/kernel/riscv64/zsum_rvv.c +++ b/kernel/riscv64/zsum_rvv.c @@ -32,8 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 @@ -44,8 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL_MAX __riscv_vsetvlmax_e64m4() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 @@ -59,6 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; size_t vlmax = VSETVL_MAX; FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); @@ -67,7 +72,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&v0, &v1, x, vl); + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); @@ -80,7 +88,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl); + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c index 17b7b9f43..c2adf5e05 100644 --- a/kernel/riscv64/zswap_rvv.c +++ b/kernel/riscv64/zswap_rvv.c @@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4 -#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4 -#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 #else #define VSETVL(n) __riscv_vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4 -#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4 -#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4 -#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n <= 0) return(0); - FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; if (inc_x == 0 && inc_y == 0) { if (n & 1) { @@ -75,8 +75,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl); - VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); } y[0] = temp[0]; y[1] = temp[1]; @@ -92,8 +92,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG m = n - 1; for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { vl = VSETVL(m); - VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl); - VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); } x[0] = temp[0]; x[1] = temp[1]; @@ -103,11 +103,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_x == 1){ @@ -116,11 +116,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSEG_FLOAT(&vx0, &vx1, x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSEG_FLOAT(x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); } } else if (inc_y == 1){ @@ -129,11 +129,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSEG_FLOAT(&vy0, &vy1, y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); - VSSEG_FLOAT(y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } else { @@ -143,11 +143,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { vl = VSETVL(n); - VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl); - VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl); + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); - VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl); - VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); } } diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index 0f9e04869..f4d806190 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -70,6 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -85,15 +92,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl); - VLSEG2_FLOAT(&va10, &va11, ao1, vl); + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += lda * 2; diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index fdc693700..069551bb0 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e32m2() #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t #define VID_V_INT __riscv_vid_v_i32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 @@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t #define VID_V_INT __riscv_vid_v_i64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 @@ -71,6 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; VBOOL_T vbool; INT_V_T vindex_max, vindex; @@ -86,15 +93,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 = a + posX * 2 + 0 + posY * lda * 2; for (i = m; i > 0; i--, offset--) { - VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl); - VLSEG2_FLOAT(&va20, &va21, ao2, vl); + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); vindex = VADD_VX_INT(vindex_max, offset, vl); vbool = VMSGT_VX_INT(vindex, 0, vl); vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); - VSSEG2_FLOAT(b, vb0, vb1, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); b += vl * 2; ao1 += 2; diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index 7276618c5..ae664561b 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t #define VID_V_UINT __riscv_vid_v_i32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -69,6 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -98,8 +103,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X > posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -119,7 +124,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -128,7 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c index 72e8f2ce2..ab8d34337 100644 --- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -32,11 +32,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -46,11 +48,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -65,6 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; #ifdef UNIT @@ -101,8 +106,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON else if (X < posY) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; @@ -115,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON for (unsigned int j = 0; j < vl; j++) { //va1 = VLEV_FLOAT(ao, vl); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +132,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c index e6d36c86d..ba6e63b96 100644 --- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -67,6 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; FLOAT *ao; + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; size_t vl; @@ -96,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON { if (X < posY) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; @@ -118,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -127,7 +135,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c index 7085cfc37..a624fff54 100644 --- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -34,11 +34,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 @@ -48,11 +50,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -66,6 +70,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, j, js, X; FLOAT *ao; + + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1; #ifdef UNIT VBOOL_T vbool_eq; @@ -103,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else if (X > posY) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; X++; @@ -115,7 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON vindex = VID_V_UINT(vl); for (j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); @@ -124,7 +133,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); #endif - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); ao += lda * 2; b += vl * 2; } diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c index 92b4b855b..db5f06af8 100644 --- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -30,10 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) #define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 #define VLEV_FLOAT __riscv_vle32_v_f32m2 #define VSEV_FLOAT __riscv_vse32_v_f32m2 -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 @@ -41,10 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 #define VLEV_FLOAT __riscv_vle64_v_f64m2 #define VSEV_FLOAT __riscv_vse64_v_f64m2 -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 @@ -85,6 +91,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b off = 0; #endif + FLOAT_VX2_T vax2; FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; @@ -130,10 +137,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -158,7 +169,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -183,7 +196,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 8; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -233,7 +248,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -262,25 +279,37 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); va0 = VFMULVF_FLOAT(vres4, alphar, vl); va1 = VFMULVF_FLOAT(vres5, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); - VSSEG2_FLOAT(C2, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); va2 = VFMULVF_FLOAT(vres6, alphar, vl); va3 = VFMULVF_FLOAT(vres7, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); - VSSEG2_FLOAT(C3, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -342,10 +371,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -360,7 +393,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -375,7 +410,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 4; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -405,7 +442,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -425,13 +464,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); va2 = VFMULVF_FLOAT(vres2, alphar, vl); va3 = VFMULVF_FLOAT(vres3, alphar, vl); va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); - VSSEG2_FLOAT(C1, va2, va3, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; @@ -487,10 +532,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #endif for (k = temp/4; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; - VLSEG2_FLOAT(&va2, &va3, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -500,7 +549,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va4, &va5, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); @@ -510,7 +561,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b ptrbb += 2; - VLSEG2_FLOAT(&va6, &va7, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); @@ -530,7 +583,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b for (k = temp & 3; k > 0; k--) { - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); ptrba += vl*2; vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); @@ -545,7 +600,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b va1 = VFMULVF_FLOAT(vres1, alphar, vl); va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); - VSSEG2_FLOAT(C0, va0, va1, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) temp = bk - off; diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c index 383cb883f..36cec711d 100644 --- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG stride_lda = sizeof(FLOAT)*lda*2; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; size_t vl; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); ao += 2; @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c index f57e9f1de..3a7bdb522 100644 --- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); b += vl * 2; ao += lda * 2; @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c index be3613429..2a158d4de 100644 --- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -31,20 +31,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -83,9 +83,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT for (unsigned int j = 0; j < vl; j++) { compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); ao += 2; b += vl * 2; } @@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii < jj) { - VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += 2; b += vl * 2; diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c index b1f5ef8f0..4b3319588 100644 --- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2_m +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m #define VBOOL_T vbool16_t #define UINT_V_T vuint32m2_t #define VID_V_UINT __riscv_vid_v_u32m2 #define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2 -#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2_m +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m #define VBOOL_T vbool32_t #define UINT_V_T vuint64m2_t #define VID_V_UINT __riscv_vid_v_u64m2 @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *ao; jj = offset; - FLOAT_V_T va0, va1; + FLOAT_VX2_T vax2; VBOOL_T vbool_cmp; UINT_V_T vindex; @@ -81,9 +81,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT vindex = VID_V_UINT(vl); for (unsigned int j = 0; j < vl; j++) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); - VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); @@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT { if (ii > jj) { - VLSEG2_FLOAT(&va0, &va1, ao, vl); - VSSEG2_FLOAT(b, va0, va1, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); } ao += lda * 2; b += vl * 2; diff --git a/param.h b/param.h index c5c70b78e..d93221d28 100644 --- a/param.h +++ b/param.h @@ -3057,7 +3057,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 16 +#define CGEMM_DEFAULT_UNROLL_MN 32 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4 From ff41cf5c49bda6bf84950812e737b2e7bcddf139 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 17 Mar 2023 14:28:26 +0300 Subject: [PATCH 588/718] Fix BLAS, BLAS-like functions and Generic RISC-V kernels * Fixed gemmt, imatcopy, zimatcopy_cnc functions * Fixed cblas_cscal testing in ctest * Removed rotmg unreacheble code * Added zero size checks --- cblas.h | 8 ++ common_interface.h | 9 ++ ctest/c_cblat1.f | 10 +- ctest/c_cblat1c.c | 6 +- interface/gemmt.c | 233 ++++++++++++++++++++++----------- interface/imatcopy.c | 8 +- interface/rotmg.c | 28 +--- interface/zimatcopy.c | 6 +- kernel/generic/zimatcopy_cnc.c | 1 - kernel/riscv64/axpby.c | 2 +- kernel/riscv64/axpy.c | 2 +- kernel/riscv64/copy.c | 2 +- kernel/riscv64/dot.c | 2 +- kernel/riscv64/swap.c | 2 +- kernel/riscv64/zaxpy.c | 2 +- kernel/riscv64/zcopy.c | 2 +- kernel/riscv64/zswap.c | 2 +- 17 files changed, 201 insertions(+), 124 deletions(-) diff --git a/cblas.h b/cblas.h index c2bdd27fa..f7d36788d 100644 --- a/cblas.h +++ b/cblas.h @@ -289,6 +289,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/common_interface.h b/common_interface.h index 318827920..61a82c306 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); + int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..cad7c7fa7 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index 8c0dd140c..af29301af 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -685,7 +685,7 @@ real *sfac; static integer i__; extern /* Subroutine */ int ctest_(); static complex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int itest1_(), stest1_(); + extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); static complex cx[8]; extern real scnrm2test_(); static integer np1; @@ -727,7 +727,7 @@ real *sfac; stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); } else if (combla_1.icase == 8) { /* .. CSCAL .. */ - cscal_(&combla_1.n, &ca, cx, &combla_1.incx); + cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); } else if (combla_1.icase == 9) { @@ -761,7 +761,7 @@ real *sfac; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; /* L80: */ } - cscal_(&c__5, &ca, cx, &combla_1.incx); + cscaltest_(&c__5, &ca, cx, &combla_1.incx); ctest_(&c__5, cx, mwpct, mwpcs, sfac); } else if (combla_1.icase == 9) { /* CSSCALTEST */ diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..a4530721c 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,18 +65,22 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; char transA, transB, Uplo; + blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME; m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -159,32 +159,47 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, if (Uplo == 'L') uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + info = 0; - if (uplo < 0) - info = 14; - if (ldc < m) + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; - if (info) { + if (info != 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, - enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; + uplo = -1; transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) transa = 0; @@ -248,9 +269,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } + + if (ldc < MAX(1, m)) info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) - info = 2; + info = 3; if (transa < 0) + info = 2; + if (uplo < 0) info = 1; } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) @@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - if (ldc < m) + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } + + if (ldc < MAX(1,m)) info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) - info = 3; + info = 4; if (transb < 0) info = 2; if (transa < 0) + info = 3; + if (uplo < 0) info = 1; - } - uplo = -1; - if (Uplo == CblasUpper) - uplo = 0; - if (Uplo == CblasLower) - uplo = 1; - if (uplo < 0) - info = 14; - if (info >= 0) { BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); return; @@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if (m == 0) return; IDEBUG_START; - FUNCTION_PROFILE_START(); +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif - const blasint incb = (transb == 0) ? 1 : ldb; + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb & 1) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { - l = k; + if (transa & 1) { aa = a + lda * i; - bb = b + i; } + if (transb & 1) + bb = b + i; cc = c + i * ldc + i; #endif @@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -458,8 +522,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -537,8 +611,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -558,32 +630,41 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; return; -} +} \ No newline at end of file diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 91975f7f4..109280fe6 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -149,10 +149,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); - else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95..b8f627221 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index ecda5ef4e..7d73ba572 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -171,10 +171,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - if ( *lda > *ldb ) - msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c index 8e772bd8a..6426cffc0 100644 --- a/kernel/generic/zimatcopy_cnc.c +++ b/kernel/generic/zimatcopy_cnc.c @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); aptr = a; lda *= 2; diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c index 278747f75..04f9518d3 100644 --- a/kernel/riscv64/axpby.c +++ b/kernel/riscv64/axpby.c @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); ix = 0; iy = 0; diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9..19d12ad3f 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c index 7b4f04f30..e79ca59af 100644 --- a/kernel/riscv64/copy.c +++ b/kernel/riscv64/copy.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG i=0; BLASLONG ix=0,iy=0; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index 46a84ad18..bf55998ca 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); while(i < n) { diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c index eac621fb2..33bbeeb6a 100644 --- a/kernel/riscv64/swap.c +++ b/kernel/riscv64/swap.c @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG ix=0,iy=0; FLOAT temp; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); while(i < n) { diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac27..18b6315cb 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c5..b0f19efd5 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c index ae4760ae0..df1402b94 100644 --- a/kernel/riscv64/zswap.c +++ b/kernel/riscv64/zswap.c @@ -45,7 +45,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; From 5b4df851d7581145f0aee4336f11127a3a7acc8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 08:29:05 +0100 Subject: [PATCH 589/718] fix stray blank on continuation line --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 6f320d8f7..a4d3f710a 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,8 +92,9 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) \ - cspr2.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + csymv.$(SUFFIX) csyr.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ From 1c04df20bd9c845160b3eb2e51adaceb6f93cf8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Mar 2023 23:04:12 +0100 Subject: [PATCH 590/718] Re-enable overriding the LAPACK SYMV,SYR,SPMV and SPR implementations --- lapack-netlib/SRC/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 49798b0c5..5f22789bd 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,22 +572,26 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o + slauum.o strti2.o strtri.o strtrs.o \ + ssymv.o ssyr.o sspmv.o sspr.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ + dsymv.o dsyr.o dspmv.o dspr.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o \ + csymv.o csyr.o cspmv.o cspr.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ + zsymv.o zsyr.o zspmv.o zspr.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From 5222b5fc18829265be7ffc77e77271a18f17c005 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 12 Oct 2023 22:06:00 +0300 Subject: [PATCH 591/718] Added axpby kernels for GENERIC RISC-V target --- kernel/riscv64/KERNEL.RISCV64_GENERIC | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b91..15bcd2289 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = ../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c From f1291614536d7d1bec6508fda9b0c56dd7286bb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Mar 2023 07:43:03 +0100 Subject: [PATCH 592/718] restore C/Z SPMV, SPR, SYR,SYMV --- interface/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index a4d3f710a..3db4b2b6d 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -122,8 +122,9 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) \ - zspr2.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From 85548e66ca25228a73ec08c257d5d92108b94b62 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 16:33:09 +0200 Subject: [PATCH 593/718] Fix build failures seen with the NO_LAPACK option - cspr/csymv/csyr belong on the LAPACK list --- interface/Makefile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 3db4b2b6d..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -92,9 +92,8 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ - cspr.$(SUFFIX) cspr2.$(SUFFIX) \ - csymv.$(SUFFIX) csyr.$(SUFFIX) \ + csbmv.$(SUFFIX) \ + cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ @@ -122,9 +121,8 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ - zspr.$(SUFFIX) zspr2.$(SUFFIX) \ - zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ + zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ From f7cf637d7aad0990625f41f83db74446a5908509 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Mar 2023 18:31:04 +0200 Subject: [PATCH 594/718] redo lost edit --- interface/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 6f320d8f7..275b71a1c 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,7 +445,8 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ + cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -456,8 +457,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) - + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ + zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ From f89e0034a479016ab5d9e1681abf07dab7f8cf38 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Wed, 20 Dec 2023 21:20:30 +0300 Subject: [PATCH 595/718] Fix LAPACK usage from BLAS --- interface/Makefile | 7 +++---- lapack-netlib/SRC/Makefile | 12 ++++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 275b71a1c..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -445,8 +445,7 @@ QLAPACKOBJS = \ CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \ - cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX) + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ @@ -457,8 +456,8 @@ CLAPACKOBJS = \ ZLAPACKOBJS = \ zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \ - zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX) + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) + XLAPACKOBJS = \ xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 5f22789bd..49798b0c5 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -572,26 +572,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o \ - ssymv.o ssyr.o sspmv.o sspr.o + slauum.o strti2.o strtri.o strtrs.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ - dsymv.o dsyr.o dspmv.o dspr.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o \ - csymv.o csyr.o cspmv.o cspr.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ - zsymv.o zsyr.o zspmv.o zspr.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) From bf39c0d8b53c8899fb81a9515da91a004d25ec0b Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 23 Jun 2023 14:51:39 +0300 Subject: [PATCH 596/718] Added new tests for BLAS-like and BLAS API in utest --- .gitignore | 1 + utest/CMakeLists.txt | 75 + utest/Makefile | 30 +- utest/test_extensions/common.c | 259 +++ utest/test_extensions/common.h | 76 + utest/test_extensions/test_caxpby.c | 631 ++++++++ utest/test_extensions/test_caxpyc.c | 158 ++ utest/test_extensions/test_cgbmv.c | 279 ++++ utest/test_extensions/test_cgeadd.c | 880 +++++++++++ utest/test_extensions/test_cgemm.c | 273 ++++ utest/test_extensions/test_cgemmt.c | 2010 ++++++++++++++++++++++++ utest/test_extensions/test_cgemv_n.c | 340 ++++ utest/test_extensions/test_cgemv_t.c | 1132 +++++++++++++ utest/test_extensions/test_cimatcopy.c | 850 ++++++++++ utest/test_extensions/test_comatcopy.c | 728 +++++++++ utest/test_extensions/test_crot.c | 792 ++++++++++ utest/test_extensions/test_crotg.c | 290 ++++ utest/test_extensions/test_csbmv.c | 606 +++++++ utest/test_extensions/test_cscal.c | 164 ++ utest/test_extensions/test_cspmv.c | 428 +++++ utest/test_extensions/test_ctrmv.c | 266 ++++ utest/test_extensions/test_ctrsv.c | 267 ++++ utest/test_extensions/test_damin.c | 354 +++++ utest/test_extensions/test_daxpby.c | 799 ++++++++++ utest/test_extensions/test_dgeadd.c | 878 +++++++++++ utest/test_extensions/test_dgemmt.c | 1442 +++++++++++++++++ utest/test_extensions/test_dimatcopy.c | 947 +++++++++++ utest/test_extensions/test_domatcopy.c | 672 ++++++++ utest/test_extensions/test_drotmg.c | 414 +++++ utest/test_extensions/test_dsum.c | 403 +++++ utest/test_extensions/test_dzamax.c | 294 ++++ utest/test_extensions/test_dzamin.c | 310 ++++ utest/test_extensions/test_dzsum.c | 403 +++++ utest/test_extensions/test_icamin.c | 625 ++++++++ utest/test_extensions/test_idamin.c | 787 ++++++++++ utest/test_extensions/test_isamin.c | 787 ++++++++++ utest/test_extensions/test_izamin.c | 625 ++++++++ utest/test_extensions/test_samin.c | 354 +++++ utest/test_extensions/test_saxpby.c | 794 ++++++++++ utest/test_extensions/test_scamax.c | 294 ++++ utest/test_extensions/test_scamin.c | 310 ++++ utest/test_extensions/test_scsum.c | 403 +++++ utest/test_extensions/test_sgeadd.c | 880 +++++++++++ utest/test_extensions/test_sgemmt.c | 1442 +++++++++++++++++ utest/test_extensions/test_simatcopy.c | 947 +++++++++++ utest/test_extensions/test_somatcopy.c | 672 ++++++++ utest/test_extensions/test_srotmg.c | 414 +++++ utest/test_extensions/test_ssum.c | 403 +++++ utest/test_extensions/test_zaxpby.c | 630 ++++++++ utest/test_extensions/test_zaxpyc.c | 159 ++ utest/test_extensions/test_zgbmv.c | 280 ++++ utest/test_extensions/test_zgeadd.c | 880 +++++++++++ utest/test_extensions/test_zgemm.c | 273 ++++ utest/test_extensions/test_zgemmt.c | 2010 ++++++++++++++++++++++++ utest/test_extensions/test_zgemv_n.c | 341 ++++ utest/test_extensions/test_zgemv_t.c | 1136 +++++++++++++ utest/test_extensions/test_zimatcopy.c | 850 ++++++++++ utest/test_extensions/test_zomatcopy.c | 745 +++++++++ utest/test_extensions/test_zrot.c | 790 ++++++++++ utest/test_extensions/test_zrotg.c | 290 ++++ utest/test_extensions/test_zsbmv.c | 606 +++++++ utest/test_extensions/test_zscal.c | 165 ++ utest/test_extensions/test_zspmv.c | 427 +++++ utest/test_extensions/test_ztrmv.c | 266 ++++ utest/test_extensions/test_ztrsv.c | 267 ++++ utest/test_extensions/xerbla.c | 88 ++ 66 files changed, 37387 insertions(+), 4 deletions(-) create mode 100644 utest/test_extensions/common.c create mode 100644 utest/test_extensions/common.h create mode 100644 utest/test_extensions/test_caxpby.c create mode 100644 utest/test_extensions/test_caxpyc.c create mode 100644 utest/test_extensions/test_cgbmv.c create mode 100644 utest/test_extensions/test_cgeadd.c create mode 100644 utest/test_extensions/test_cgemm.c create mode 100644 utest/test_extensions/test_cgemmt.c create mode 100644 utest/test_extensions/test_cgemv_n.c create mode 100644 utest/test_extensions/test_cgemv_t.c create mode 100644 utest/test_extensions/test_cimatcopy.c create mode 100644 utest/test_extensions/test_comatcopy.c create mode 100644 utest/test_extensions/test_crot.c create mode 100644 utest/test_extensions/test_crotg.c create mode 100644 utest/test_extensions/test_csbmv.c create mode 100644 utest/test_extensions/test_cscal.c create mode 100644 utest/test_extensions/test_cspmv.c create mode 100644 utest/test_extensions/test_ctrmv.c create mode 100644 utest/test_extensions/test_ctrsv.c create mode 100644 utest/test_extensions/test_damin.c create mode 100644 utest/test_extensions/test_daxpby.c create mode 100644 utest/test_extensions/test_dgeadd.c create mode 100644 utest/test_extensions/test_dgemmt.c create mode 100644 utest/test_extensions/test_dimatcopy.c create mode 100644 utest/test_extensions/test_domatcopy.c create mode 100644 utest/test_extensions/test_drotmg.c create mode 100644 utest/test_extensions/test_dsum.c create mode 100644 utest/test_extensions/test_dzamax.c create mode 100644 utest/test_extensions/test_dzamin.c create mode 100644 utest/test_extensions/test_dzsum.c create mode 100644 utest/test_extensions/test_icamin.c create mode 100644 utest/test_extensions/test_idamin.c create mode 100644 utest/test_extensions/test_isamin.c create mode 100644 utest/test_extensions/test_izamin.c create mode 100644 utest/test_extensions/test_samin.c create mode 100644 utest/test_extensions/test_saxpby.c create mode 100644 utest/test_extensions/test_scamax.c create mode 100644 utest/test_extensions/test_scamin.c create mode 100644 utest/test_extensions/test_scsum.c create mode 100644 utest/test_extensions/test_sgeadd.c create mode 100644 utest/test_extensions/test_sgemmt.c create mode 100644 utest/test_extensions/test_simatcopy.c create mode 100644 utest/test_extensions/test_somatcopy.c create mode 100644 utest/test_extensions/test_srotmg.c create mode 100644 utest/test_extensions/test_ssum.c create mode 100644 utest/test_extensions/test_zaxpby.c create mode 100644 utest/test_extensions/test_zaxpyc.c create mode 100644 utest/test_extensions/test_zgbmv.c create mode 100644 utest/test_extensions/test_zgeadd.c create mode 100644 utest/test_extensions/test_zgemm.c create mode 100644 utest/test_extensions/test_zgemmt.c create mode 100644 utest/test_extensions/test_zgemv_n.c create mode 100644 utest/test_extensions/test_zgemv_t.c create mode 100644 utest/test_extensions/test_zimatcopy.c create mode 100644 utest/test_extensions/test_zomatcopy.c create mode 100644 utest/test_extensions/test_zrot.c create mode 100644 utest/test_extensions/test_zrotg.c create mode 100644 utest/test_extensions/test_zsbmv.c create mode 100644 utest/test_extensions/test_zscal.c create mode 100644 utest/test_extensions/test_zspmv.c create mode 100644 utest/test_extensions/test_ztrmv.c create mode 100644 utest/test_extensions/test_ztrsv.c create mode 100644 utest/test_extensions/xerbla.c diff --git a/.gitignore b/.gitignore index 0fe20ecaa..e3e783c46 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ config_last.h getarch getarch_2nd utest/openblas_utest +utest/openblas_utest_ext ctest/xccblat1 ctest/xccblat2 ctest/xccblat3 diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 2e32827d3..d78701707 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -18,6 +18,69 @@ else () ) endif () + +set(DIR_EXT test_extensions) +set(OpenBLAS_utest_ext_src +utest_main.c +${DIR_EXT}/xerbla.c +${DIR_EXT}/test_isamin.c +${DIR_EXT}/test_idamin.c +${DIR_EXT}/test_icamin.c +${DIR_EXT}/test_izamin.c +${DIR_EXT}/test_ssum.c +${DIR_EXT}/test_dsum.c +${DIR_EXT}/test_scsum.c +${DIR_EXT}/test_dzsum.c +${DIR_EXT}/test_samin.c +${DIR_EXT}/test_damin.c +${DIR_EXT}/test_scamin.c +${DIR_EXT}/test_dzamin.c +${DIR_EXT}/test_scamax.c +${DIR_EXT}/test_dzamax.c +${DIR_EXT}/test_zrotg.c +${DIR_EXT}/test_crotg.c +$(DIR_EXT)/test_drotmg.c +$(DIR_EXT)/test_srotmg.c +$(DIR_EXT)/test_zscal.c +$(DIR_EXT)/test_cscal.c +$(DIR_EXT)/test_domatcopy.c +$(DIR_EXT)/test_somatcopy.c +$(DIR_EXT)/test_zomatcopy.c +$(DIR_EXT)/test_comatcopy.c +${DIR_EXT}/test_simatcopy.c +${DIR_EXT}/test_dimatcopy.c +${DIR_EXT}/test_cimatcopy.c +${DIR_EXT}/test_zimatcopy.c +${DIR_EXT}/test_sgeadd.c +${DIR_EXT}/test_dgeadd.c +${DIR_EXT}/test_cgeadd.c +${DIR_EXT}/test_zgeadd.c +${DIR_EXT}/test_saxpby.c +${DIR_EXT}/test_daxpby.c +${DIR_EXT}/test_caxpby.c +${DIR_EXT}/test_zaxpby.c +${DIR_EXT}/test_caxpyc.c +${DIR_EXT}/test_zaxpyc.c +$(DIR_EXT)/test_cgemv_t.c +$(DIR_EXT)/test_zgemv_t.c +$(DIR_EXT)/test_cgemv_n.c +$(DIR_EXT)/test_zgemv_n.c +${DIR_EXT}/test_crot.c +${DIR_EXT}/test_zrot.c +${DIR_EXT}/test_cgbmv.c +${DIR_EXT}/test_zgbmv.c +${DIR_EXT}/test_dgemmt.c +${DIR_EXT}/test_sgemmt.c +${DIR_EXT}/test_cgemmt.c +${DIR_EXT}/test_zgemmt.c +${DIR_EXT}/test_ztrmv.c +${DIR_EXT}/test_ctrmv.c +$(DIR_EXT)/test_ztrsv.c +$(DIR_EXT)/test_ctrsv.c +$(DIR_EXT)/test_zgemm.c +$(DIR_EXT)/test_cgemm.c +) + # crashing on travis cl with an error code suggesting resource not found if (NOT MSVC) set(OpenBLAS_utest_src @@ -46,6 +109,13 @@ set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c ) +set(OpenBLAS_utest_ext_src + ${OpenBLAS_utest_ext_src} + ${DIR_EXT}/test_cspmv.c + ${DIR_EXT}/test_zspmv.c + ${DIR_EXT}/test_csbmv.c + ${DIR_EXT}/test_zsbmv.c + ) if (NOT NO_CBLAS AND NOT NO_LAPACKE) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} @@ -57,7 +127,11 @@ endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) +set(OpenBLAS_utest_ext_bin openblas_utest_ext) +add_executable(${OpenBLAS_utest_ext_bin} ${OpenBLAS_utest_ext_src}) + target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) +target_link_libraries(${OpenBLAS_utest_ext_bin} ${OpenBLAS_LIBNAME}) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX" ) target_link_libraries(${OpenBLAS_utest_bin} m) @@ -82,3 +156,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin} endif() add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) +add_test(${OpenBLAS_utest_ext_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin}) diff --git a/utest/Makefile b/utest/Makefile index f99035440..55561c770 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,21 +1,38 @@ UTEST_CHECK = 1 TOPDIR = .. +DIR_EXT=test_extensions override TARGET_ARCH= override TARGET_MACH= UTESTBIN=openblas_utest +UTESTEXTBIN=openblas_utest_ext .PHONY : all -.NOTPARALLEL : all run_test $(UTESTBIN) +.NOTPARALLEL : all run_test $(UTESTBIN) $(UTESTEXTBIN) include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o +OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o +OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o +OBJS_EXT+=$(DIR_EXT)/test_ssum.o $(DIR_EXT)/test_dsum.o $(DIR_EXT)/test_scsum.o $(DIR_EXT)/test_dzsum.o +OBJS_EXT+=$(DIR_EXT)/test_saxpby.o $(DIR_EXT)/test_daxpby.o $(DIR_EXT)/test_caxpby.o $(DIR_EXT)/test_zaxpby.o $(DIR_EXT)/test_zaxpyc.o $(DIR_EXT)/test_caxpyc.o +OBJS_EXT+=$(DIR_EXT)/test_samin.o $(DIR_EXT)/test_damin.o $(DIR_EXT)/test_scamin.o $(DIR_EXT)/test_dzamin.o $(DIR_EXT)/test_scamax.o $(DIR_EXT)/test_dzamax.o +OBJS_EXT+=$(DIR_EXT)/test_drotmg.o $(DIR_EXT)/test_srotmg.o $(DIR_EXT)/test_zrotg.o $(DIR_EXT)/test_crotg.o $(DIR_EXT)/test_crot.o $(DIR_EXT)/test_zrot.o +OBJS_EXT+=$(DIR_EXT)/test_zscal.o $(DIR_EXT)/test_cscal.o +OBJS_EXT+=$(DIR_EXT)/test_domatcopy.o $(DIR_EXT)/test_somatcopy.o $(DIR_EXT)/test_zomatcopy.o $(DIR_EXT)/test_comatcopy.o +OBJS_EXT+=$(DIR_EXT)/test_simatcopy.o $(DIR_EXT)/test_dimatcopy.o $(DIR_EXT)/test_cimatcopy.o $(DIR_EXT)/test_zimatcopy.o +OBJS_EXT+=$(DIR_EXT)/test_sgeadd.o $(DIR_EXT)/test_dgeadd.o $(DIR_EXT)/test_cgeadd.o $(DIR_EXT)/test_zgeadd.o +OBJS_EXT+=$(DIR_EXT)/test_cgemv_t.o $(DIR_EXT)/test_zgemv_t.o $(DIR_EXT)/test_cgemv_n.o $(DIR_EXT)/test_zgemv_n.o +OBJS_EXT+=$(DIR_EXT)/test_sgemmt.o $(DIR_EXT)/test_dgemmt.o $(DIR_EXT)/test_cgemmt.o $(DIR_EXT)/test_zgemmt.o +OBJS_EXT+=$(DIR_EXT)/test_ztrmv.o $(DIR_EXT)/test_ctrmv.o $(DIR_EXT)/test_ztrsv.o $(DIR_EXT)/test_ctrsv.o +OBJS_EXT+=$(DIR_EXT)/test_zgemm.o $(DIR_EXT)/test_cgemm.o $(DIR_EXT)/test_zgbmv.o $(DIR_EXT)/test_cgbmv.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +OBJS_EXT += $(DIR_EXT)/test_zspmv.o $(DIR_EXT)/test_cspmv.o $(DIR_EXT)/test_zsbmv.o $(DIR_EXT)/test_csbmv.o ifneq ($(NO_CBLAS), 1) ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o @@ -47,12 +64,17 @@ all : run_test $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -run_test: $(UTESTBIN) +$(UTESTEXTBIN): $(OBJS_EXT) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) + +run_test: $(UTESTBIN) $(UTESTEXTBIN) ifneq ($(CROSS), 1) ./$(UTESTBIN) + ./$(UTESTEXTBIN) endif clean: - -rm -f *.o $(UTESTBIN) + -rm -f *.o $(UTESTBIN) $(UTESTEXTBIN) + -rm -f $(DIR_EXT)/*.o -libs: +libs: \ No newline at end of file diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c new file mode 100644 index 000000000..c3bdcefc7 --- /dev/null +++ b/utest/test_extensions/common.c @@ -0,0 +1,259 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +/** + * Generate random array + */ +void srand_generate(float *alpha, blasint n) +{ + blasint i; + for (i = 0; i < n; i++) + alpha[i] = (float)rand() / (float)RAND_MAX * 5.0f; +} + +void drand_generate(double *alpha, blasint n) +{ + blasint i; + for (i = 0; i < n; i++) + alpha[i] = (double)rand() / (double)RAND_MAX * 5.0; +} + +/** + * Find difference between two rectangle matrix + * return norm of differences + */ +float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld) +{ + blasint i = 0; + blasint j = 0; + blasint inc = 1; + float norm = 0.0f; + + float *a_ptr = a; + float *b_ptr = b; + + for(i = 0; i < rows; i++) + { + for (j = 0; j < cols; j++) { + a_ptr[j] -= b_ptr[j]; + } + norm += cblas_snrm2(cols, a_ptr, inc); + + a_ptr += ld; + b_ptr += ld; + } + return norm/(float)(rows); +} + +double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld) +{ + blasint i = 0; + blasint j = 0; + blasint inc = 1; + double norm = 0.0; + + double *a_ptr = a; + double *b_ptr = b; + + for(i = 0; i < rows; i++) + { + for (j = 0; j < cols; j++) { + a_ptr[j] -= b_ptr[j]; + } + norm += cblas_dnrm2(cols, a_ptr, inc); + + a_ptr += ld; + b_ptr += ld; + } + return norm/(double)(rows); +} + +/** + * Complex conjugate operation for vector + * + * param n specifies number of elements in vector x + * param inc_x specifies increment of vector x + * param x_ptr specifies buffer holding vector x + */ +void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr) +{ + blasint i; + inc_x *= 2; + + for (i = 0; i < n; i++) + { + x_ptr[1] *= (-1.0f); + x_ptr += inc_x; + } +} + +void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr) +{ + blasint i; + inc_x *= 2; + + for (i = 0; i < n; i++) + { + x_ptr[1] *= (-1.0); + x_ptr += inc_x; + } +} + +/** + * Transpose matrix + * + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param a_src - buffer holding input matrix A + * param lda_src - leading dimension of the matrix A + * param a_dst - buffer holding output matrix A + * param lda_dst - leading dimension of output matrix A + */ +void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != cols; i++) + { + for (j = 0; j != rows; j++) + a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i]; + } +} + +void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != cols; i++) + { + for (j = 0; j != rows; j++) + a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i]; + } +} + +void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != cols*2; i+=2) + { + for (j = 0; j != rows*2; j+=2){ + a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1]; + a_dst[(i/2)*lda_dst+j+1] = (-1.0f) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i]; + } + } +} + +void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != cols*2; i+=2) + { + for (j = 0; j != rows*2; j+=2){ + a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1]; + a_dst[(i/2)*lda_dst+j+1] = (-1.0) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i]; + } + } +} + +/** + * Copy matrix from source A to destination A + * + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param a_src - buffer holding input matrix A + * param lda_src - leading dimension of the matrix A + * param a_dst - buffer holding output matrix A + * param lda_dst - leading dimension of output matrix A + * param conj specifies conjugation + */ +void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols; j++) + a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j]; + } +} + +void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst) +{ + blasint i, j; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols; j++) + a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j]; + } +} + +void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols*2; j+=2){ + a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1]; + a_dst[i*lda_dst+j+1] = (-1.0f) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; + } + } +} + +void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj) +{ + blasint i, j; + lda_dst *= 2; + lda_src *= 2; + for (i = 0; i != rows; i++) + { + for (j = 0; j != cols*2; j+=2){ + a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1]; + a_dst[i*lda_dst+j+1] = (-1.0) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j]; + } + } +} \ No newline at end of file diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h new file mode 100644 index 000000000..62b84325c --- /dev/null +++ b/utest/test_extensions/common.h @@ -0,0 +1,76 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#ifndef _TEST_EXTENSION_COMMON_H_ +#define _TEST_EXTENSION_COMMON_H_ + +#include +#include + +#define TRUE 1 +#define FALSE 0 +#define INVALID -1 +#define SINGLE_TOL 1e-02f +#define DOUBLE_TOL 1e-10 + +extern int check_error(void); +extern void set_xerbla(char* current_rout, int expected_info); +extern int BLASFUNC(xerbla)(char *name, blasint *info, blasint length); + +extern void srand_generate(float *alpha, blasint n); +extern void drand_generate(double *alpha, blasint n); + +extern float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld); +extern double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld); + +extern void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr); +extern void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr); + +extern void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst); +extern void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst); +extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj); +extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj); + +extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst); +extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst); +extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, + float *a_dst, blasint lda_dst, int conj); +extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, + double *a_dst, blasint lda_dst, int conj); +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c new file mode 100644 index 000000000..221a48ac7 --- /dev/null +++ b/utest/test_extensions/test_caxpby.c @@ -0,0 +1,631 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CAXPBY { + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CAXPBY data_caxpby; + +/** + * Fortran API specific function + * Test caxpby by comparing it with cscal and caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy) +{ + blasint i; + + // cscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_caxpby.x_test, n * incx_abs * 2); + srand_generate(data_caxpby.y_test, n * incy_abs * 2); + + // Copy vector x for caxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_caxpby.x_verify[i] = data_caxpby.x_test[i]; + + // Copy vector y for cscal + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_verify[i] = data_caxpby.y_test[i]; + + // Find beta*y + BLASFUNC(cscal)(&n, beta, data_caxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(caxpy)(&n, alpha, data_caxpby.x_verify, &incx, + data_caxpby.y_verify, &incy); + + BLASFUNC(caxpby)(&n, alpha, data_caxpby.x_test, &incx, + beta, data_caxpby.y_test, &incy); + + // Find the differences between output vector caculated by caxpby and caxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_test[i] -= data_caxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test caxpby by comparing it with cscal and caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy) +{ + blasint i; + + // cscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_caxpby.x_test, n * incx_abs * 2); + srand_generate(data_caxpby.y_test, n * incy_abs * 2); + + // Copy vector x for caxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_caxpby.x_verify[i] = data_caxpby.x_test[i]; + + // Copy vector y for cscal + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_verify[i] = data_caxpby.y_test[i]; + + // Find beta*y + cblas_cscal(n, beta, data_caxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_caxpy(n, alpha, data_caxpby.x_verify, incx, + data_caxpby.y_verify, incy); + + cblas_caxpby(n, alpha, data_caxpby.x_test, incx, + beta, data_caxpby.y_test, incy); + + // Find the differences between output vector caculated by caxpby and caxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_caxpby.y_test[i] -= data_caxpby.y_verify[i]; + + // Find the norm of differences + return cblas_scnrm2(n, data_caxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {3.0f, 1.0f}; + float beta[] = {4.0f, 3.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(caxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha[] = {5.0f, 2.2f}; + float beta[] = {4.0f, 5.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(caxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {6.0f, 3.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(caxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha[] = {7.0f, 2.0f}; + float beta[] = {3.5f, 1.3f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(caxpby, inc_x_1_inc_y_1_N_100_a_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(caxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(caxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 2.1f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {3.0f, 2.0f}; + float beta[] = {4.0f, 3.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(caxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha[] = {5.0f, 2.0f}; + float beta[] = {4.0f, 3.1f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(caxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {6.0f, 2.3f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(caxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha[] = {7.0f, 1.0f}; + float beta[] = {3.5f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_a_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test caxpby by comparing it with cscal and caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(caxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = c_api_check_caxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_caxpyc.c b/utest/test_extensions/test_caxpyc.c new file mode 100644 index 000000000..ed1899e57 --- /dev/null +++ b/utest/test_extensions/test_caxpyc.c @@ -0,0 +1,158 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CAXPYC { + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CAXPYC data_caxpyc; + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_caxpyc(blasint n, float *alpha, blasint incx, blasint incy) +{ + blasint i; + + srand_generate(data_caxpyc.x_test, n * incx * 2); + srand_generate(data_caxpyc.y_test, n * incy * 2); + + for (i = 0; i < n * incx * 2; i++) + data_caxpyc.x_verify[i] = data_caxpyc.x_test[i]; + + for (i = 0; i < n * incy * 2; i++) + data_caxpyc.y_verify[i] = data_caxpyc.y_test[i]; + + cconjugate_vector(n, incx, data_caxpyc.x_verify); + + BLASFUNC(caxpy)(&n, alpha, data_caxpyc.x_verify, &incx, + data_caxpyc.y_verify, &incy); + + BLASFUNC(caxpyc)(&n, alpha, data_caxpyc.x_test, &incx, + data_caxpyc.y_test, &incy); + + for (i = 0; i < n * incy * 2; i++) + data_caxpyc.y_verify[i] -= data_caxpyc.y_test[i]; + + return BLASFUNC(scnrm2)(&n, data_caxpyc.y_verify, &incy); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(caxpyc, conj_strides_one) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(caxpyc, conj_incx_one) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(caxpyc, conj_incy_one) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test caxpyc by conjugating vector x and comparing with caxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(caxpyc, conj_strides_two) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha[] = {5.0f, 2.2f}; + + float norm = check_caxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_cgbmv.c b/utest/test_extensions/test_cgbmv.c new file mode 100644 index 000000000..8e0640c5d --- /dev/null +++ b/utest/test_extensions/test_cgbmv.c @@ -0,0 +1,279 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 1 + +struct DATA_CGBMV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_band_storage[DATASIZE * DATASIZE * 2]; + float matrix[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGBMV data_cgbmv; + +/** + * Transform full-storage band matrix A to band-packed storage mode. + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param a - buffer for holding band-packed matrix + * param lda - specifies the leading dimension of a + * param matrix - buffer holding full-storage band matrix A + * param ldm - specifies the leading full-storage band matrix A + */ +static void transform_to_band_storage(blasint m, blasint n, blasint kl, + blasint ku, float* a, blasint lda, + float* matrix, blasint ldm) +{ + blasint i, j, k; + for (j = 0; j < n; j++) + { + k = 2 * (ku - j); + for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) + { + a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2]; + a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1]; + } + } +} + +/** + * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param band_matrix - buffer for full-storage band matrix. + * param matrix - buffer holding input general matrix + * param ldm - specifies the leading of input general matrix +*/ +static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, + float *band_matrix, float *matrix, blasint ldm) +{ + blasint i, j; + blasint k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < m * 2; j += 2) + { + if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) + { + band_matrix[i * ldm * 2 + j] = 0.0f; + band_matrix[i * ldm * 2 + j + 1] = 0.0f; + continue; + } + + band_matrix[i * ldm * 2 + j] = matrix[k++]; + band_matrix[i * ldm * 2 + j + 1] = matrix[k++]; + } + } +} + +/** + * Comapare results computed by cgbmv and cgemv + * since gbmv is gemv for band matrix + * + * param trans specifies op(A), the transposition operation applied to A + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku, + float *alpha, blasint lda, blasint inc_b, float *beta, blasint inc_c) +{ + blasint i; + blasint lenb, lenc; + + if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){ + lenb = m; + lenc = n; + } else { + lenb = n; + lenc = m; + } + + srand_generate(data_cgbmv.matrix, m * n * 2); + srand_generate(data_cgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b)); + srand_generate(data_cgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c)); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_cgbmv.c_verify[i] = data_cgbmv.c_test[i]; + + get_band_matrix(m, n, kl, ku, data_cgbmv.a_test, data_cgbmv.matrix, m); + + transform_to_band_storage(m, n, kl, ku, data_cgbmv.a_band_storage, lda, data_cgbmv.a_test, m); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgbmv.a_test, &m, data_cgbmv.b_test, + &inc_b, beta, data_cgbmv.c_verify, &inc_c); + + BLASFUNC(cgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_cgbmv.a_band_storage, &lda, data_cgbmv.b_test, + &inc_b, beta, data_cgbmv.c_test, &inc_c); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_cgbmv.c_verify[i] -= data_cgbmv.c_test[i]; + + return BLASFUNC(scnrm2)(&lenc, data_cgbmv.c_verify, &inc_c); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is D + */ +CTEST(cgbmv, trans_D) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'D'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is O + */ +CTEST(cgbmv, trans_O) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 10; + blasint lda = 50; + char trans = 'O'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is S + */ +CTEST(cgbmv, trans_S) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 6, ku = 9; + blasint lda = 50; + char trans = 'S'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is U + */ +CTEST(cgbmv, trans_U) +{ + blasint m = 25, n = 50; + blasint inc_b = 1, inc_c = 1; + blasint kl = 7, ku = 11; + blasint lda = kl + ku + 1; + char trans = 'U'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is C + */ +CTEST(cgbmv, trans_C) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'C'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgbmv by comparing it against cgemv + * with param trans is R + */ +CTEST(cgbmv, trans_R) +{ + blasint m = 50, n = 100; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'R'; + + float alpha[] = {7.0f, 1.0f}; + float beta[] = {1.5f, -1.5f}; + + float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} +#endif diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c new file mode 100644 index 000000000..0cf6cbf87 --- /dev/null +++ b/utest/test_extensions/test_cgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_CGEADD { + float a_test[M * N * 2]; + float c_test[M * N * 2]; + float c_verify[M * N * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEADD data_cgeadd; + +/** + * cgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr, + blasint lda, float *beta, float *cptr, blasint ldc) +{ + blasint i; + + lda *= 2; + ldc *= 2; + + for (i = 0; i < n; i++) + { + cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test cgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, float *alpha, blasint lda, + float *beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + srand_generate(data_cgeadd.a_test, lda * rows * 2); + srand_generate(data_cgeadd.c_test, ldc * rows * 2); + + // Copy matrix C for cgeadd + for (i = 0; i < ldc * rows * 2; i++) + data_cgeadd.c_verify[i] = data_cgeadd.c_test[i]; + + cgeadd_trusted(cols, rows, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, + beta, data_cgeadd.c_test, &ldc); + else + cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by cgeadd and sgemm + return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, cols, rows, ldc*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in cgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + set_xerbla("CGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda, + beta, data_cgeadd.c_test, &ldc); + else + cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda, + beta, data_cgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 2.0f}; + float beta[] = {1.0f, 3.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(cgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.5f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(cgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 1.5f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(cgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(cgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific tests + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(cgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(cgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 6; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(cgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(cgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(cgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 3.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {4.0f, 1.5f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + float alpha[] = {3.0f, 2.5f}; + float beta[] = {1.0f, 2.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {3.0f, 1.5f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {0.0f, 0.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgeadd by comparing it against sgemm + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(cgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha[] = {2.0f, 3.0f}; + float beta[] = {2.0f, 4.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(cgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(cgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(cgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c new file mode 100644 index 000000000..cd38d710b --- /dev/null +++ b/utest/test_extensions/test_cgemm.c @@ -0,0 +1,273 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CGEMM { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float b_verify[DATASIZE * DATASIZE * 2]; + float c_test[DATASIZE * DATASIZE * 2]; + float c_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMM data_cgemm; + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * + * param transa specifies op(A), the transposition (conjugation) operation applied to A + * param transb specifies op(B), the transposition (conjugation) operation applied to B + * param m specifies the number of rows of the matrix op(A) and of the matrix C + * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C + * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of matrix A + * param ldb - leading dimension of matrix B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of matrix C + * return norm of difference + */ +static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint k, + float *alpha, blasint lda, blasint ldb, float *beta, blasint ldc) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char transa_verify = transa; + char transb_verify = transb; + + int arows = k, acols = m; + int brows = n, bcols = k; + + if (transa == 'T' || transa == 'C'){ + arows = m; acols = k; + } + + if (transb == 'T' || transb == 'C'){ + brows = k; bcols = n; + } + + srand_generate(data_cgemm.a_test, arows * lda * 2); + srand_generate(data_cgemm.b_test, brows * ldb * 2); + srand_generate(data_cgemm.c_test, n * ldc * 2); + + for (i = 0; i < arows * lda * 2; i++) + data_cgemm.a_verify[i] = data_cgemm.a_test[i]; + + for (i = 0; i < brows * ldb * 2; i++) + data_cgemm.b_verify[i] = data_cgemm.b_test[i]; + + for (i = 0; i < n * ldc * 2; i++) + data_cgemm.c_verify[i] = data_cgemm.c_test[i]; + + if (transa == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda); + transa_verify = 'N'; + } + + if (transb == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb); + transb_verify = 'N'; + } + + BLASFUNC(cgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_cgemm.a_verify, &lda, + data_cgemm.b_verify, &ldb, beta, data_cgemm.c_verify, &ldc); + + BLASFUNC(cgemm)(&transa, &transb, &m, &n, &k, alpha, data_cgemm.a_test, &lda, + data_cgemm.b_test, &ldb, beta, data_cgemm.c_test, &ldc); + + return smatrix_difference(data_cgemm.c_test, data_cgemm.c_verify, m, n, ldc*2); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, conjtransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'C'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is not conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, notransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'N'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and transposed + */ +CTEST(cgemm, conjnotransa_conjtransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'C'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is not conjugate and not transposed + */ +CTEST(cgemm, conjnotransa_notransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'N'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, conjnotransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is transposed + */ +CTEST(cgemm, conjnotransa_transb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'T'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test cgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate cgemm. + * Test with the following options: + * + * matrix A is transposed + * matrix B is conjugate and not transposed + */ +CTEST(cgemm, transa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'T'; + char transb = 'R'; + float alpha[] = {-2.0, 1.0f}; + float beta[] = {1.0f, -1.0f}; + + float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c new file mode 100644 index 000000000..ed9279933 --- /dev/null +++ b/utest/test_extensions/test_cgemmt.c @@ -0,0 +1,2010 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_CGEMMT { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float c_test[DATASIZE * DATASIZE * 2]; + float c_verify[DATASIZE * DATASIZE * 2]; + float c_gemm[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMMT data_cgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float *alpha, blasint lda, + blasint ldb, float *beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda, + data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc); + else + cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc); + + ldc *= 2; + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } +} + +/** + * Comapare results computed by cgemmt and cgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float *alpha, blasint lda, + blasint ldb, float *beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc * 2; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + srand_generate(data_cgemmt.a_test, a_cols * lda * 2); + srand_generate(data_cgemmt.b_test, b_cols * ldb * 2); + srand_generate(data_cgemmt.c_test, m * ldc * 2); + + for (i = 0; i < m * ldc * 2; i++) + data_cgemmt.c_gemm[i] = data_cgemmt.c_verify[i] = data_cgemmt.c_test[i]; + + cgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, + &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); + else + cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); + + for (i = 0; i < m * ldc * 2; i++) + data_cgemmt.c_verify[i] -= data_cgemmt.c_test[i]; + + return BLASFUNC(snrm2)(&size_c, data_cgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in cgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + set_xerbla("CGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test, + &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc); + else + cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda, + data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'U'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'L'; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {-1.0f, -1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {-1.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 25, ldc = 50; + float alpha[] = {2.0f, 1.0f}; + float beta[] = {-1.0f, 2.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {0.0f, 0.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(cgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(cgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(cgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(cgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cgemv_n.c b/utest/test_extensions/test_cgemv_n.c new file mode 100644 index 000000000..60c9af86a --- /dev/null +++ b/utest/test_extensions/test_cgemv_n.c @@ -0,0 +1,340 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSPMV_N { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSPMV_N data_cgemv_n; + +/** + * cgemv not transposed reference code + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param a - buffer holding input matrib A + * param lda - leading dimension of matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + */ +static void cgemv_n_trusted(char trans, blasint m, blasint n, float *alpha, float *a, + blasint lda, float *b, blasint inc_b, float *beta, float *c, + blasint inc_c) +{ + blasint i, j; + blasint i2 = 0; + blasint ib = 0, ic = 0; + + float temp_r, temp_i; + + float *a_ptr = a; + blasint lda2 = 2*lda; + + blasint inc_b2 = 2 * inc_b; + blasint inc_c2 = 2 * inc_c; + + BLASFUNC(cscal)(&m, beta, c, &inc_c); + + for (j = 0; j < n; j++) + { + + if (trans == 'N' || trans == 'R') { + temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib]; + } else { + temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib]; + } + + ic = 0; + i2 = 0; + + for (i = 0; i < m; i++) + { + if (trans == 'N') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + if (trans == 'O') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'R') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'S') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + i2 += 2; + ic += inc_c2; + } + a_ptr += lda2; + ib += inc_b2; + } + +} + +/** + * Comapare results computed by cgemv and cgemv_n_trusted + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param lda - leading dimension of matrix A + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cgemv_n(char trans, blasint m, blasint n, float *alpha, blasint lda, + blasint inc_b, float *beta, blasint inc_c) +{ + blasint i; + + srand_generate(data_cgemv_n.a_test, n * lda); + srand_generate(data_cgemv_n.b_test, 2 * n * inc_b); + srand_generate(data_cgemv_n.c_test, 2 * m * inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_cgemv_n.c_verify[i] = data_cgemv_n.c_test[i]; + + cgemv_n_trusted(trans, m, n, alpha, data_cgemv_n.a_test, lda, data_cgemv_n.b_test, + inc_b, beta, data_cgemv_n.c_test, inc_c); + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_n.a_test, &lda, data_cgemv_n.b_test, + &inc_b, beta, data_cgemv_n.c_verify, &inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_cgemv_n.c_verify[i] -= data_cgemv_n.c_test[i]; + + return BLASFUNC(scnrm2)(&n, data_cgemv_n.c_verify, &inc_c); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_o_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cgemv, trans_o_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'O'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cgemv, trans_s_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.4f, 0.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cgemv, trans_s_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'S'; + float alpha[] = {2.0f, -1.0f}; + float beta[] = {1.0f, 5.0f}; + + float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +#endif diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c new file mode 100644 index 000000000..aa3281e66 --- /dev/null +++ b/utest/test_extensions/test_cgemv_t.c @@ -0,0 +1,1132 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 +#define INCREMENT 2 + +struct DATA_CGEMV_T { + float a_test[N * M * 2]; + float a_verify[N * M * 2]; + float y_test[M * INCREMENT * 2]; + float y_verify[M * INCREMENT * 2]; + float x_test[M * INCREMENT * 2]; + float x_verify[M * INCREMENT * 2]; +}; + +// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON +// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03 +#define SINGLE_EPS_ZGEMV 5e-03 + +#ifdef BUILD_COMPLEX +static struct DATA_CGEMV_T data_cgemv_t; + +/** + * Find product of matrix-vector multiplication + * + * param n specifies number of columns of A + * param m specifies number of rows of A and size of vector x + * param lda specifies leading dimension of A + * param inc_x specifies increment of vector x + */ +static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) +{ + blasint i; + float *a_ptr = data_cgemv_t.a_verify; + float *x_ptr = data_cgemv_t.x_test; + float *x_res = data_cgemv_t.x_verify; + + openblas_complex_float result; + + for (i = 0; i < n * inc_x; i+= inc_x) + { + result = cblas_cdotu(lda, a_ptr, 1, x_ptr, inc_x); + x_res[0] = CREAL(result); + x_res[1] = CIMAG(result); + a_ptr += lda * 2; + x_res += 2 * inc_x; + } +} + +/** + * Test cgemv by comparing it against comatcopy, caxpby and + * reference func matrix_vector_product + * + * comatcopy perform operation: op(A) + * matrix_vector_product perform operation: A*x + * caxpby perform operation: alpha*x + beta*y + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param alpha specifies scalar alpha + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param beta specifies scalar beta + * param inc_y specifies increment for vector y + * return norm of difference between cgemv and result of reference funcs + */ +static float check_cgemv(char api, char order, char trans, blasint m, blasint n, float *alpha, + blasint lda, blasint inc_x, float *beta, blasint inc_y) +{ + blasint i; + + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + // Transpose parameters for comatcopy + // cgemv_t perform operation on transposed matrix, no need to transpose a_verify + char trans_copy; + char ctrans_copy; + + // Param alpha for comatcopy, scale on alpha perform caxpby + float alpha_one[] = {1.0f, 0.0f}; + + memset(data_cgemv_t.x_verify, 0.0f, m * inc_x * 2 * sizeof(float)); + + // Fill matrix A, vectors x, y + srand_generate(data_cgemv_t.a_test, lda * n * 2); + srand_generate(data_cgemv_t.x_test, m * inc_x * 2); + srand_generate(data_cgemv_t.y_test, m * inc_y * 2); + + // Copy vector y for reference funcs + for (int i = 0; i < m * inc_y * 2; i++) { + data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; + } + + if (api == 'F') { + if (trans == 'T') trans_copy = 'N'; + if (trans == 'C') trans_copy = 'R'; + if (trans == 'U') trans_copy = 'R'; + if (trans == 'D') trans_copy = 'N'; + + // Perform operation: op(A) + BLASFUNC(comatcopy)(&order, &trans_copy, &m, &n, alpha_one, data_cgemv_t.a_test, &lda, data_cgemv_t.a_verify, &lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find conj(x) + if (trans == 'U' || trans == 'D') { + cconjugate_vector(m, inc_x, data_cgemv_t.x_verify); + } + + // Find alpha*x+beta*y + BLASFUNC(caxpby)(&n, alpha, data_cgemv_t.x_verify, &inc_x, beta, data_cgemv_t.y_verify, &inc_y); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test, + &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;} + if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;} + if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;} + if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;} + + // Perform operation: op(A) + cblas_comatcopy(corder, ctrans_copy, m, n, alpha_one, data_cgemv_t.a_test, lda, data_cgemv_t.a_verify, lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find alpha*x+beta*y + cblas_caxpby(n, alpha, data_cgemv_t.x_verify, inc_x, beta, data_cgemv_t.y_verify, inc_y); + + cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test, + lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y); + } + + // Find the differences between output vector caculated by cgemv and reference funcs + for (i = 0; i < m * inc_y * 2; i++) + data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i]; + + // Find the norm of differences + return cblas_scnrm2(m, data_cgemv_t.y_test, inc_y); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float a[] = {1.0f, 1.0f}; + float x[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + float y[] = {1.0f, 1.0f}; + + set_xerbla("CGEMV ", expected_info); + + BLASFUNC(cgemv)(&trans, &m, &n, alpha, a, &lda, x, &inc_x, beta, y, &inc_y); + + return check_error(); +} + +/** + * C API specific function + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float a[] = {1.0f, 1.0f}; + float x[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + float y[] = {1.0f, 1.0f}; + + set_xerbla("CGEMV ", expected_info); + + cblas_cgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {2.0f, 2.0f}; + + blasint inc_x = 2; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 2.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0f, alpha_i = 2.0f + * beta_r = 1.0f, beta_i = 1.0f + */ +CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + float alpha[] = {1.0f, 2.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 2; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + + +/** + * Fortran API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'D'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 2.0f}; + + blasint inc_x = 1; + blasint inc_y = 2; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 1.0f + */ +CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * No trans + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 3.0f, beta_i = 2.0f + */ +CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {3.0f, 1.0f}; + + blasint inc_x = 2; + blasint inc_y = 2; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Column Major + * Conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 1.0f, alpha_i = 3.0f + * beta_r = 1.0f, beta_i = 2.5f + */ +CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + float alpha[] = {1.0f, 3.0f}; + float beta[] = {1.0f, 2.5f}; + + blasint inc_x = 1; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test cgemv by comparing it against reference + * with the following options: + * + * Row Major + * Conjugate + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0f, alpha_i = 1.0f + * beta_r = 1.0f, beta_i = 1.5f + */ +CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.5f}; + + blasint inc_x = 2; + blasint inc_y = 1; + + float norm = check_cgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_inc_y) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_y_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_y_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_inc_x) +{ + char order = 'C'; + char trans = 'T'; + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_x_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_inc_x_row_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_n) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_n_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_n_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_m) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_m_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_m_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * lda must be at least n. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_lda) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least m. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_lda_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least n. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_lda_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(cgemv, xerbla_invalid_trans) +{ + char order = 'C'; + char trans = 'Z'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(cgemv, c_api_xerbla_invalid_trans_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Row major + */ +CTEST(cgemv, c_api_xerbla_invalid_trans_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param order. + */ +CTEST(cgemv, c_api_xerbla_invalid_order_col_major) +{ + enum CBLAS_ORDER corder = INVALID; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 0; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c new file mode 100644 index 000000000..800f8a2d1 --- /dev/null +++ b/utest/test_extensions/test_cimatcopy.c @@ -0,0 +1,850 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_CIMATCOPY { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CIMATCOPY data_cimatcopy; + +/** + * Comapare results computed by cimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static float check_cimatcopy(char api, char order, char trans, blasint rows, blasint cols, float *alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m*2; + if (trans == 'C') + conj = 1; + } + else { + rows_out = m; cols_out = n*2; + if (trans == 'R') + conj = 1; + } + + srand_generate(data_cimatcopy.a_test, lda_src*m*2); + + if (trans == 'T' || trans == 'C') { + ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + } + else { + ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + } + + if (api == 'F') { + BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by cimatcopy and reference func + return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + + set_xerbla("CIMATCOPY", expected_info); + + BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = -3.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {-3.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, colmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'C'; + float alpha[] = {1.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, colmajor_conj_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 2.0, alpha_i = 3.0 + */ +CTEST(cimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha[] = {2.0f, 3.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(cimatcopy, rowmajor_conj_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha[] = {3.0f, 2.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = 3.0, alpha_i = 1.5 + */ +CTEST(cimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {3.0f, 1.5f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha[] = {3.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.0f, 1.0f}; + + float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(cimatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(cimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(cimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(cimatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda_src = 0, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(cimatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda_src = 100, lda_dst = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(cimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(cimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(cimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(cimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(cimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c new file mode 100644 index 000000000..8a3d5ee7b --- /dev/null +++ b/utest/test_extensions/test_comatcopy.c @@ -0,0 +1,728 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_COMATCOPY { + float a_test[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * DATASIZE * 2]; + float b_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_COMATCOPY data_comatcopy; + +/** + * Comapare results computed by comatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static float check_comatcopy(char api, char order, char trans, blasint rows, blasint cols, float* alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m*2; + if (trans == 'C') + conj = 1; + } + else { + b_rows = m; b_cols = n*2; + if (trans == 'R') + conj = 1; + } + + srand_generate(data_comatcopy.a_test, lda*m*2); + + if (trans == 'T' || trans == 'C') { + ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + } + else { + ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + } + + if (api == 'F') { + BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, + &lda, data_comatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, + lda, data_comatcopy.b_test, ldb); + } + + return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + + set_xerbla("COMATCOPY", expected_info); + + BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, + &lda, data_comatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {-1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(comatcopy, colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha[] = {-1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(comatcopy, c_api_colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + float alpha[] = {2.0f, 1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(comatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + float alpha[] = {1.5f, -1.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test comatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + float alpha[] = {1.0f, 2.0f}; + + float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(comatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(comatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(comatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda = 0, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(comatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda = 100, ldb = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_rowmajor_conj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_rowmajor_transconj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(comatcopy, xerbla_colmajor_conj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(comatcopy, xerbla_colmajor_transconj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c new file mode 100644 index 000000000..1c55216d9 --- /dev/null +++ b/utest/test_extensions/test_crot.c @@ -0,0 +1,792 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CROT { + float x_test[DATASIZE * INCREMENT * 2]; + float y_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; + float y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CROT data_crot; + +/** + * Fortran API specific function + * Comapare results computed by csrot and caxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s) +{ + blasint i; + float norm = 0; + float s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + srand_generate(data_crot.x_test, n * inc_x_abs * 2); + srand_generate(data_crot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + srand_generate(data_crot.x_test, n * 2); + srand_generate(data_crot.y_test, n * 2); + } + + // Copy vector x for caxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_crot.x_verify[i] = data_crot.x_test[i]; + + // Copy vector y for caxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_crot.y_verify[i] = data_crot.y_test[i]; + + // Find cx = c*x + s*y + BLASFUNC(caxpby)(&n, s, data_crot.y_test, &inc_y, c, data_crot.x_verify, &inc_x); + + // Find cy = -conjg(s)*x + c*y + BLASFUNC(caxpby)(&n, s_neg, data_crot.x_test, &inc_x, c, data_crot.y_verify, &inc_y); + + BLASFUNC(csrot)(&n, data_crot.x_test, &inc_x, data_crot.y_test, &inc_y, c, s); + + // Find the differences between vector x caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_crot.x_test[i] -= data_crot.x_verify[i]; + + // Find the differences between vector y caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_crot.y_test[i] -= data_crot.y_verify[i]; + + // Find the norm of differences + norm += BLASFUNC(scnrm2)(&n, data_crot.x_test, &inc_x_abs); + norm += BLASFUNC(scnrm2)(&n, data_crot.y_test, &inc_y_abs); + return (norm / 2); +} + +/** + * C API specific function + * Comapare results computed by csrot and caxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static float c_api_check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s) +{ + blasint i; + float norm = 0; + float s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + srand_generate(data_crot.x_test, n * inc_x_abs * 2); + srand_generate(data_crot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + srand_generate(data_crot.x_test, n * 2); + srand_generate(data_crot.y_test, n * 2); + } + + // Copy vector x for caxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_crot.x_verify[i] = data_crot.x_test[i]; + + // Copy vector y for caxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_crot.y_verify[i] = data_crot.y_test[i]; + + // Find cx = c*x + s*y + cblas_caxpby(n, s, data_crot.y_test, inc_y, c, data_crot.x_verify, inc_x); + + // Find cy = -conjg(s)*x + c*y + cblas_caxpby(n, s_neg, data_crot.x_test, inc_x, c, data_crot.y_verify, inc_y); + + cblas_csrot(n, data_crot.x_test, inc_x, data_crot.y_test, inc_y, c[0], s[0]); + + // Find the differences between vector x caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_crot.x_test[i] -= data_crot.x_verify[i]; + + // Find the differences between vector y caculated by caxpby and csrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_crot.y_test[i] -= data_crot.y_verify[i]; + + // Find the norm of differences + norm += cblas_scnrm2(n, data_crot.x_test, inc_x_abs); + norm += cblas_scnrm2(n, data_crot.y_test, inc_y_abs); + return (norm / 2); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0f + * s = 2.0f + */ +CTEST(crot, inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0f + * s = 1.0f + */ +CTEST(crot, inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {2.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0f + * s = 1.0f + */ +CTEST(crot, inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {0.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 0.0f + */ +CTEST(crot, inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {0.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {3.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {2.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 2.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {2.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0f + * s = 1.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {0.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0f + * s = 0.0f + */ +CTEST(crot, c_api_inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {0.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test crot by comparing it with caxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0f + * s = 1.0f + */ +CTEST(crot, c_api_check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for caxpby + float c[] = {1.0f, 0.0f}; + float s[] = {1.0f, 0.0f}; + + float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c new file mode 100644 index 000000000..9db7dc7d3 --- /dev/null +++ b/utest/test_extensions/test_crotg.c @@ -0,0 +1,290 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_a) +{ + float sa[2] = {0.0f, 0.0f}; + float sb[2] = {1.0f, 1.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_b) +{ + float sa[2] = {1.0f, 1.0f}; + float sb[2] = {0.0f, 0.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, zero_real) +{ + float sa[2] = {0.0f, 1.0f}; + float sb[2] = {0.0f, 1.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, positive_real_positive_img) +{ + float sa[2] = {3.0f, 4.0f}; + float sb[2] = {4.0f, 6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, negative_real_positive_img) +{ + float sa[2] = {-3.0f, 4.0f}; + float sb[2] = {-4.0f, 6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, positive_real_negative_img) +{ + float sa[2] = {3.0f, -4.0f}; + float sb[2] = {4.0f, -6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, negative_real_negative_img) +{ + float sa[2] = {-3.0f, -4.0f}; + float sb[2] = {-4.0f, -6.0f}; + float ss[2]; + float sc; + BLASFUNC(crotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_a) +{ + float sa[2] = {0.0f, 0.0f}; + float sb[2] = {1.0f, 1.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_b) +{ + float sa[2] = {1.0f, 1.0f}; + float sb[2] = {0.0f, 0.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_zero_real) +{ + float sa[2] = {0.0f, 1.0f}; + float sb[2] = {0.0f, 1.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_positive_real_positive_img) +{ + float sa[2] = {3.0f, 4.0f}; + float sb[2] = {4.0f, 6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_negative_real_positive_img) +{ + float sa[2] = {-3.0f, 4.0f}; + float sb[2] = {-4.0f, 6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_positive_real_negative_img) +{ + float sa[2] = {3.0f, -4.0f}; + float sb[2] = {4.0f, -6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} + +/** + * C API specific test + * Test crotg by comparing it against pre-calculated values + */ +CTEST(crotg, c_api_negative_real_negative_img) +{ + float sa[2] = {-3.0f, -4.0f}; + float sb[2] = {-4.0f, -6.0f}; + float ss[2]; + float sc; + cblas_crotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c new file mode 100644 index 000000000..8e8ce4530 --- /dev/null +++ b/utest/test_extensions/test_csbmv.c @@ -0,0 +1,606 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSBMV { + float sp_matrix[DATASIZE * (DATASIZE + 1)]; + float sb_matrix[DATASIZE * DATASIZE * 2]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON +// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03 +#define SINGLE_EPS_ZGEMV 5e-03 + +#ifdef BUILD_COMPLEX +static struct DATA_CSBMV data_csbmv; + +/** + * Transform full-storage symmetric band matrix A to upper (U) or lower (L) + * band-packed storage mode. + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * output param a - buffer for holding symmetric band-packed matrix + * param lda - specifies the leading dimension of a + * param sb_matrix - buffer holding full-storage symmetric band matrix A + * param ldm - specifies the leading dimension of A + */ +static void transform_to_band_storage(char uplo, blasint n, blasint k, float* a, blasint lda, + float* sb_matrix, blasint ldm) +{ + blasint i, j, m; + if (uplo == 'L') { + for (j = 0; j < n; j++) + { + m = -j; + for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } + else { + for (j = 0; j < n; j++) + { + m = k - j; + for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } +} + +/** + * Generate full-storage symmetric band matrix A with k - super-diagonals + * from input symmetric packed matrix in lower packed mode (L) + * + * output param sb_matrix - buffer for holding full-storage symmetric band matrix. + * param sp_matrix - buffer holding input symmetric packed matrix + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + */ +static void get_symmetric_band_matr(float *sb_matrix, float *sp_matrix, blasint n, blasint k) +{ + blasint m; + blasint i, j; + m = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + // Make matrix band with k super-diagonals + if (fabs((i+1) - ceil((j+1)/2.0f)) > k) + { + sb_matrix[i * n * 2 + j] = 0.0f; + sb_matrix[i * n * 2 + j + 1] = 0.0f; + continue; + } + + if (j / 2 < i) + { + sb_matrix[i * n * 2 + j] = + sb_matrix[j * n + i * 2]; + sb_matrix[i * n * 2 + j + 1] = + sb_matrix[j * n + i * 2 + 1]; + } + else + { + sb_matrix[i * n * 2 + j] = sp_matrix[m++]; + sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++]; + } + } + } +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param inc_c - stride of vector c_test + * param expected_info - expected invalid parameter number in csbmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b, + blasint inc_c, int expected_info) +{ + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float a[2]; + srand_generate(a, 2); + + set_xerbla("CSBMV ", expected_info); + + BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, data_csbmv.b_test, + &inc_b, beta, data_csbmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Comapare results computed by csbmv and cgemv + * since csbmv is cgemv for symmetric band matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param beta - scaling factor for vector c_test + * param inc_c - stride of vector c_test + * param lda - specifies the leading dimension of a + * return norm of differences + */ +static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint lda, + blasint inc_b, float *beta, blasint inc_c, blasint ldm) +{ + blasint i; + + // Trans param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Symmetric band packed matrix for sbmv + float a[lda * n * 2]; + + // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test + srand_generate(data_csbmv.sp_matrix, n * (n + 1)); + srand_generate(data_csbmv.b_test, n * inc_b * 2); + srand_generate(data_csbmv.c_test, n * inc_c * 2); + + // Copy vector c_test for cgemv + for (i = 0; i < n * inc_c * 2; i++) + data_csbmv.c_verify[i] = data_csbmv.c_test[i]; + + // Generate full-storage symmetric band matrix + // with k super-diagonals from symmetric packed matrix + get_symmetric_band_matr(data_csbmv.sb_matrix, data_csbmv.sp_matrix, n, k); + + // Transform symmetric band matrix from conventional + // full matrix storage to band storage for csbmv + transform_to_band_storage(uplo, n, k, a, lda, data_csbmv.sb_matrix, ldm); + + BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_csbmv.sb_matrix, &ldm, data_csbmv.b_test, + &inc_b, beta, data_csbmv.c_verify, &inc_c); + + BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, + data_csbmv.b_test, &inc_b, beta, data_csbmv.c_test, &inc_c); + + // Find the differences between output vector caculated by csbmv and cgemv + for (i = 0; i < n * inc_c * 2; i++) + data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(csbmv, upper_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(csbmv, upper_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, upper_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(csbmv, lower_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(csbmv, lower_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test csbmv by comparing it against cgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(csbmv, lower_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + float alpha[] = {2.0f, 1.0f}; + float beta[] = {2.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Check if output matrix a contains any NaNs + */ +CTEST(csbmv, check_for_NaN) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {1.0f, 1.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode. + */ +CTEST(csbmv, xerbla_uplo_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'O'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 1; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(csbmv, xerbla_n_invalid) +{ + blasint n = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 2; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Check if n - number of rows and columns of A equal zero. + */ +CTEST(csbmv, check_n_zero) +{ + blasint n = 0, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = 1; + char uplo = 'U'; + + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b_test. Can't be zero. + */ +CTEST(csbmv, xerbla_inc_b_zero) +{ + blasint n = 1, inc_b = 0, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 8; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c_test. Can't be zero. + */ +CTEST(csbmv, xerbla_inc_c_zero) +{ + blasint n = 1, inc_b = 1, inc_c = 0; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 11; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param k - + * number of super-diagonals of A. Must be at least zero. + */ +CTEST(csbmv, xerbla_k_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = INVALID; + blasint lda = 1; + int expected_info = 3; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda - + * specifies the leading dimension of a. Must be at least (k+1). + */ +CTEST(csbmv, xerbla_lda_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = INVALID; + int expected_info = 6; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c new file mode 100644 index 000000000..009c600ad --- /dev/null +++ b/utest/test_extensions/test_cscal.c @@ -0,0 +1,164 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSCAL { + float x_test[DATASIZE * 2 * INCREMENT]; + float x_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSCAL data_cscal; + +/** + * cscal reference code + * + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param x - buffer holding input vector x + * param inc - stride of vector x + */ +static void cscal_trusted(blasint n, float *alpha, float* x, blasint inc){ + blasint i, ip = 0; + blasint inc_x2 = 2 * inc; + float temp; + for (i = 0; i < n; i++) + { + temp = alpha[0] * x[ip] - alpha[1] * x[ip+1]; + x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip]; + x[ip] = temp; + ip += inc_x2; + } +} + +/** + * Comapare results computed by cscal and cscal_trusted + * + * param api specifies tested api (C or Fortran) + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param inc - stride of vector x + * return norm of differences + */ +static float check_cscal(char api, blasint n, float *alpha, blasint inc) +{ + blasint i; + + // Fill vectors a + srand_generate(data_cscal.x_test, n * inc * 2); + + // Copy vector x for cscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_cscal.x_verify[i] = data_cscal.x_test[i]; + + cscal_trusted(n, alpha, data_cscal.x_verify, inc); + + if(api == 'F') + BLASFUNC(cscal)(&n, alpha, data_cscal.x_test, &inc); + else + cblas_cscal(n, alpha, data_cscal.x_test, inc); + + // Find the differences between output vector computed by cscal and cscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_cscal.x_verify[i] -= data_cscal.x_test[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_cscal.x_verify, &inc); +} + +/** + * Fortran API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + float alpha[2] = {0.0f, 1.0f}; + + float norm = check_cscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + float alpha[2] = {0.0f, 0.0f}; + + float norm = check_cscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, c_api_alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + float alpha[2] = {0.0f, 1.0f}; + + float norm = check_cscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test cscal by comparing it against reference + */ +CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + float alpha[2] = {0.0f, 0.0f}; + + float norm = check_cscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_cspmv.c b/utest/test_extensions/test_cspmv.c new file mode 100644 index 000000000..b64c90e3a --- /dev/null +++ b/utest/test_extensions/test_cspmv.c @@ -0,0 +1,428 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_CSPMV { + float a_verify[DATASIZE * DATASIZE * 2]; + float a_test[DATASIZE * (DATASIZE + 1)]; + float b_test[DATASIZE * 2 * INCREMENT]; + float c_test[DATASIZE * 2 * INCREMENT]; + float c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CSPMV data_cspmv; + +/** + * Compute spmv via gemv since spmv is gemv for symmetric packed matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param a - buffer holding input matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + * output param data_cspmv.c_verify - matrix computed by gemv + */ +static void cspmv_trusted(char uplo, blasint n, float *alpha, float *a, + float *b, blasint inc_b, float *beta, float *c, + blasint inc_c) +{ + blasint k; + blasint i, j; + + // param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Unpack the input symmetric packed matrix + if (uplo == 'L') + { + k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + if (j / 2 < i) + { + data_cspmv.a_verify[i * n * 2 + j] = + data_cspmv.a_verify[j * n + i * 2]; + data_cspmv.a_verify[i * n * 2 + j + 1] = + data_cspmv.a_verify[j * n + i * 2 + 1]; + } + else + { + data_cspmv.a_verify[i * n * 2 + j] = a[k++]; + data_cspmv.a_verify[i * n * 2 + j + 1] = a[k++]; + } + } + } + } + else + { + k = n * (n + 1) - 1; + for (j = 2 * n - 1; j >= 0; j -= 2) + { + for (i = n - 1; i >= 0; i--) + { + if (j / 2 < i) + { + data_cspmv.a_verify[i * n * 2 + j] = + data_cspmv.a_verify[(j - 1) * n + i * 2 + 1]; + data_cspmv.a_verify[i * n * 2 + j - 1] = + data_cspmv.a_verify[(j - 1) * n + i * 2]; + } + else + { + data_cspmv.a_verify[i * n * 2 + j] = a[k--]; + data_cspmv.a_verify[i * n * 2 + j - 1] = a[k--]; + } + } + } + } + + // Run gemv with the unpacked matrix + BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_cspmv.a_verify, &n, b, + &inc_b, beta, data_cspmv.c_verify, &inc_c); +} + +/** + * Comapare results computed by cspmv and cspmv_trusted + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static float check_cspmv(char uplo, blasint n, float *alpha, blasint inc_b, + float *beta, blasint inc_c) +{ + blasint i; + + // Fill symmetric packed maxtix a, vectors b and c + srand_generate(data_cspmv.a_test, n * (n + 1)); + srand_generate(data_cspmv.b_test, 2 * n * inc_b); + srand_generate(data_cspmv.c_test, 2 * n * inc_c); + + // Copy vector c for cspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_cspmv.c_verify[i] = data_cspmv.c_test[i]; + + cspmv_trusted(uplo, n, alpha, data_cspmv.a_test, data_cspmv.b_test, + inc_b, beta, data_cspmv.c_verify, inc_c); + + BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test, + &inc_b, beta, data_cspmv.c_test, &inc_c); + + // Find the differences between output vector computed by cspmv and cspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_cspmv.c_test[i] -= data_cspmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(scnrm2)(&n, data_cspmv.c_test, &inc_c); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param inc_b - stride of vector b + * param inc_c - stride of vector c + * param expected_info - expected invalid parameter number in cspmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint inc_b, + blasint inc_c, int expected_info) +{ + float alpha[] = {1.0, 1.0}; + float beta[] = {0.0, 0.0}; + + set_xerbla("CSPMV ", expected_info); + + BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test, + &inc_b, beta, data_cspmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cspmv, upper_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(cspmv, upper_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(cspmv, upper_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 0.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cspmv, upper_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'U'; + float alpha[] = {2.5, -2.1}; + float beta[] = {0.0f, 1.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(cspmv, lower_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(cspmv, lower_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'L'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(cspmv, lower_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'L'; + float alpha[] = {1.0f, 0.0f}; + float beta[] = {1.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Test cspmv by comparing it against cgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(cspmv, lower_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'L'; + float alpha[] = {2.5, -2.1}; + float beta[] = {0.0f, 1.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL); +} + +/** + * Check if output matrix A contains any NaNs + */ +CTEST(cspmv, check_for_NaN) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + float alpha[] = {1.0f, 1.0f}; + float beta[] = {0.0f, 0.0f}; + + float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * uplo specifies whether A is upper or lower triangular. + */ +CTEST(cspmv, xerbla_uplo_invalid) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(cspmv, xerbla_N_invalid) +{ + blasint N = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b. Can't be zero. + */ +CTEST(cspmv, xerbla_inc_b_zero) +{ + blasint N = DATASIZE, inc_b = 0, inc_c = 1; + char uplo = 'U'; + int expected_info = 6; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c. Can't be zero. + */ +CTEST(cspmv, xerbla_inc_c_zero) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 0; + char uplo = 'U'; + int expected_info = 9; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c new file mode 100644 index 000000000..2a3f27416 --- /dev/null +++ b/utest/test_extensions/test_ctrmv.c @@ -0,0 +1,266 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_CTRMV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CTRMV data_ctrmv; + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char trans_verify = trans; + + srand_generate(data_ctrmv.a_test, n * lda * 2); + srand_generate(data_ctrmv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ctrmv.a_verify[i] = data_ctrmv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ctrmv.x_verify[i] = data_ctrmv.x_test[i]; + + if (trans == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ctrmv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ctrmv)(&uplo, &trans_verify, &diag, &n, data_ctrmv.a_verify, &lda, + data_ctrmv.x_verify, &incx); + + BLASFUNC(ctrmv)(&uplo, &trans, &diag, &n, data_ctrmv.a_test, &lda, + data_ctrmv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ctrmv.x_verify[i] -= data_ctrmv.x_test[i]; + + return BLASFUNC(scnrm2)(&n, data_ctrmv.x_verify, &incx); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ctrmv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ctrmv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ctrmv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ctrmv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c new file mode 100644 index 000000000..0e639bb2a --- /dev/null +++ b/utest/test_extensions/test_ctrsv.c @@ -0,0 +1,267 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_CTRSV { + float a_test[DATASIZE * DATASIZE * 2]; + float a_verify[DATASIZE * DATASIZE * 2]; + float x_test[DATASIZE * INCREMENT * 2]; + float x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX +static struct DATA_CTRSV data_ctrsv; + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + float alpha_conj[] = {1.0f, 0.0f}; + char trans_verify = trans; + + srand_generate(data_ctrsv.a_test, n * lda * 2); + srand_generate(data_ctrsv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ctrsv.a_verify[i] = data_ctrsv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ctrsv.x_verify[i] = data_ctrsv.x_test[i]; + + if (trans == 'R'){ + cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, + alpha_conj, data_ctrsv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ctrsv)(&uplo, &trans_verify, &diag, &n, data_ctrsv.a_verify, + &lda, data_ctrsv.x_verify, &incx); + + BLASFUNC(ctrsv)(&uplo, &trans, &diag, &n, data_ctrsv.a_test, &lda, + data_ctrsv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ctrsv.x_verify[i] -= data_ctrsv.x_test[i]; + + return BLASFUNC(scnrm2)(&n, data_ctrsv.x_verify, &incx); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ctrsv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ctrsv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ctrsv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ctrsv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} + +/** + * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ctrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + float norm = check_ctrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c new file mode 100644 index 000000000..d492343ed --- /dev/null +++ b/utest/test_extensions/test_damin.c @@ -0,0 +1,354 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} + +/** + * Test damin by comparing it against pre-calculated values + */ +CTEST(damin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + double amin = BLASFUNC(damin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c new file mode 100644 index 000000000..6e77c7c7c --- /dev/null +++ b/utest/test_extensions/test_daxpby.c @@ -0,0 +1,799 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_DAXPBY{ + double x_test[DATASIZE * INCREMENT]; + double x_verify[DATASIZE * INCREMENT]; + double y_test[DATASIZE * INCREMENT]; + double y_verify[DATASIZE * INCREMENT]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DAXPBY data_daxpby; + +/** + * Fortran API specific function + * Test daxpby by comparing it with dscal and daxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy) +{ + blasint i; + + // dscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_daxpby.x_test, n * incx_abs); + drand_generate(data_daxpby.y_test, n * incy_abs); + + // Copy vector x for daxpy + for (i = 0; i < n * incx_abs; i++) + data_daxpby.x_verify[i] = data_daxpby.x_test[i]; + + // Copy vector y for dscal + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_verify[i] = data_daxpby.y_test[i]; + + // Find beta*y + BLASFUNC(dscal)(&n, &beta, data_daxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(daxpy)(&n, &alpha, data_daxpby.x_verify, &incx, + data_daxpby.y_verify, &incy); + + BLASFUNC(daxpby)(&n, &alpha, data_daxpby.x_test, &incx, + &beta, data_daxpby.y_test, &incy); + + // Find the differences between output vector caculated by daxpby and daxpy + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_test[i] -= data_daxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(dnrm2)(&n, data_daxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test daxpby by comparing it with dscal and daxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy) +{ + blasint i; + + // dscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Copy vector x for daxpy + for (i = 0; i < n * incx_abs; i++) + data_daxpby.x_verify[i] = data_daxpby.x_test[i]; + + // Copy vector y for dscal + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_verify[i] = data_daxpby.y_test[i]; + + // Find beta*y + cblas_dscal(n, beta, data_daxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_daxpy(n, alpha, data_daxpby.x_verify, incx, + data_daxpby.y_verify, incy); + + cblas_daxpby(n, alpha, data_daxpby.x_test, incx, + beta, data_daxpby.y_test, incy); + + // Find the differences between output vector caculated by daxpby and daxpy + for (i = 0; i < n * incy_abs; i++) + data_daxpby.y_test[i] -= data_daxpby.y_verify[i]; + + // Find the norm of differences + return cblas_dnrm2(n, data_daxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(daxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 3.0; + double beta = 4.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(daxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha = 5.0; + double beta = 4.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(daxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha = 1.0; + double beta = 6.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(daxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha = 7.0; + double beta = 3.5; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 0.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(daxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 2.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 2.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 3.0; + double beta = 4.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(daxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha = 5.0; + double beta = 4.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(daxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha = 1.0; + double beta = 6.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(daxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha = 7.0; + double beta = 3.5; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha = 1.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha = 0.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test daxpby by comparing it with dscal and daxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha = 0.0; + double beta = 0.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(daxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha = 1.0; + double beta = 1.0; + + double norm = c_api_check_daxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c new file mode 100644 index 000000000..4654c51a3 --- /dev/null +++ b/utest/test_extensions/test_dgeadd.c @@ -0,0 +1,878 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_DGEADD{ + double a_test[M * N]; + double c_test[M * N]; + double c_verify[M * N]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DGEADD data_dgeadd; + +/** + * dgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr, + blasint lda, double beta, double *cptr, blasint ldc) +{ + blasint i; + + for (i = 0; i < n; i++) + { + cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test dgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, double alpha, blasint lda, + double beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + drand_generate(data_dgeadd.a_test, lda * rows); + drand_generate(data_dgeadd.c_test, ldc * rows); + + // Copy matrix C for dgeadd + for (i = 0; i < ldc * rows; i++) + data_dgeadd.c_verify[i] = data_dgeadd.c_test[i]; + + dgeadd_trusted(cols, rows, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, + &beta, data_dgeadd.c_test, &ldc); + else + cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by dgeadd and sgemm + return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in dgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + double alpha = 1.0; + double beta = 1.0; + + set_xerbla("DGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda, + &beta, data_dgeadd.c_test, &ldc); + else + cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda, + beta, data_dgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 3.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(dgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 2.5; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(dgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 0.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(dgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 0.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(dgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(dgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(dgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 6; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(dgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(dgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(dgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 2.0; + double beta = 3.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 4.0; + double beta = 2.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + double alpha = 3.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 0.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha = 0.0; + double beta = 0.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(dgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha = 3.0; + double beta = 4.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(dgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(dgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(dgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha = 1.0; + double beta = 1.0; + + double norm = check_dgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c new file mode 100644 index 000000000..22dcaf2aa --- /dev/null +++ b/utest/test_extensions/test_dgemmt.c @@ -0,0 +1,1442 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DGEMMT { + double a_test[DATASIZE * DATASIZE]; + double b_test[DATASIZE * DATASIZE]; + double c_test[DATASIZE * DATASIZE]; + double c_verify[DATASIZE * DATASIZE]; + double c_gemm[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DGEMMT data_dgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double alpha, blasint lda, + blasint ldb, double beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda, + data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc); + else + cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc); + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } +} + +/** + * Comapare results computed by dgemmt and dgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double alpha, blasint lda, + blasint ldb, double beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + drand_generate(data_dgemmt.a_test, a_cols * lda); + drand_generate(data_dgemmt.b_test, b_cols * ldb); + drand_generate(data_dgemmt.c_test, m * ldc); + + for (i = 0; i < m * ldc; i++) + data_dgemmt.c_gemm[i] = data_dgemmt.c_verify[i] = data_dgemmt.c_test[i]; + + dgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, + &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); + else + cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); + + for (i = 0; i < m * ldc; i++) + data_dgemmt.c_verify[i] -= data_dgemmt.c_test[i]; + + return BLASFUNC(dnrm2)(&size_c, data_dgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in dgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + double alpha = 1.0; + double beta = 0.0; + + set_xerbla("DGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test, + &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc); + else + cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda, + data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + double alpha = 1.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + double alpha = 1.0; + double beta = 0.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 1.5; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(dgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 0.0; + double beta = 2.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dgemmt by comparing it against dgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(dgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha = 2.0; + double beta = 1.0; + + double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(dgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(dgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(dgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c new file mode 100644 index 000000000..4debb50e8 --- /dev/null +++ b/utest/test_extensions/test_dimatcopy.c @@ -0,0 +1,947 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DIMATCOPY { + double a_test[DATASIZE* DATASIZE]; + double a_verify[DATASIZE* DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DIMATCOPY data_dimatcopy; + +/** + * Comapare results computed by dimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static double check_dimatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m; + } + else { + rows_out = m; cols_out = n; + } + + drand_generate(data_dimatcopy.a_test, lda_src*m); + + if (trans == 'T' || trans == 'C') { + dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + } + else { + dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + } + + if (api == 'F') { + BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by dimatcopy and reference func + return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + double alpha = 1.0; + + set_xerbla("DIMATCOPY", expected_info); + + BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 1.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0 + */ +CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(dimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(dimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(dimatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda_src = 0, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(dimatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda_src = 100, lda_dst = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(dimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(dimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c new file mode 100644 index 000000000..f692e8784 --- /dev/null +++ b/utest/test_extensions/test_domatcopy.c @@ -0,0 +1,672 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_DOMATCOPY { + double a_test[DATASIZE * DATASIZE]; + double b_test[DATASIZE * DATASIZE]; + double b_verify[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_DOUBLE +static struct DATA_DOMATCOPY data_domatcopy; + +/** + * Comapare results computed by domatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static double check_domatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m; + } + else { + b_rows = m; b_cols = n; + } + + drand_generate(data_domatcopy.a_test, lda*m); + + if (trans == 'T' || trans == 'C') { + dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + } + else { + dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + } + + if (api == 'F') { + BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, + &lda, data_domatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test, + lda, data_domatcopy.b_test, ldb); + } + + return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + double alpha = 1.0; + + set_xerbla("DOMATCOPY", expected_info); + + BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, + &lda, data_domatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific tests + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, colmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 50; + char order = 'C'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, rowmajor_conjtrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha = 2.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Matrix dimensions leave residues from 4 and 2 (specialize + * for rt case) + * alpha = 1.5 + */ +CTEST(domatcopy, rowmajor_trans_col_27_row_27) +{ + blasint m = 27, n = 27; + blasint lda = 27, ldb = 27; + char order = 'R'; + char trans = 'T'; + double alpha = 1.5; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(domatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 0.0; + + double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test domatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + double alpha = 1.0; + + double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(domatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(domatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(domatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda = 0, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(domatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda = 100, ldb = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(domatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(domatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(domatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(domatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(domatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(domatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c new file mode 100644 index 000000000..3073c8e3e --- /dev/null +++ b/utest/test_extensions/test_drotmg.c @@ -0,0 +1,414 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, y1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 0.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 0.0; + + tr_param[0] = -2.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, d1_negative) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, d1_positive_d2_positive_x1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 0.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 8.0; + + tr_param[0] = 1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * Fortran API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, scaled_y_greater_than_scaled_x) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0; + te_d2 = tr_d2 = -2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_y1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 0.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 0.0; + + tr_param[0] = -2.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_d1_negative) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_d1_positive_d2_positive_x1_zero) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0; + te_d2 = tr_d2 = 2.0; + te_x1 = tr_x1 = 0.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0; + tr_d2 = 2.0; + tr_x1 = 8.0; + tr_y1 = 8.0; + + tr_param[0] = 1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} + +/** + * C API specific test + * Test drotmg by comparing it against pre-calculated values + */ +CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x) +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0; + te_d2 = tr_d2 = -2.0; + te_x1 = tr_x1 = 8.0; + te_y1 = tr_y1 = 8.0; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0; + tr_d2 = 0.0; + tr_x1 = 0.0; + tr_y1 = 8.0; + + tr_param[0] = -1.0; + tr_param[1] = 0.0; + tr_param[2] = 0.0; + tr_param[3] = 0.0; + tr_param[4] = 0.0; + + //OpenBLAS + cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); + } +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c new file mode 100644 index 000000000..e987c5a42 --- /dev/null +++ b/utest/test_extensions/test_dsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.0, 1.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0}; + + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.0, 1.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0}; + + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dsum by comparing it against pre-calculated values + */ +CTEST(dsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c new file mode 100644 index 000000000..edea3de8f --- /dev/null +++ b/utest/test_extensions/test_dzamax.c @@ -0,0 +1,294 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -3.0, -1.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -3.0, -1.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} + +/** + * Test dzamax by comparing it against pre-calculated values + */ +CTEST(dzamax, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0; + x[7 * inc * 2 + 1] = 1000.0; + double amax = BLASFUNC(dzamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c new file mode 100644 index 000000000..916eede92 --- /dev/null +++ b/utest/test_extensions/test_dzamin.c @@ -0,0 +1,310 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} + +/** + * Test dzamin by comparing it against pre-calculated values + */ +CTEST(dzamin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + double amin = BLASFUNC(dzamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c new file mode 100644 index 000000000..5139f59cb --- /dev/null +++ b/utest/test_extensions/test_dzsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0, 2.3, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0, 2.3, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2}; + + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = BLASFUNC(dzsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0, 2.3, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, -1.0, 2.3, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2}; + + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} + +/** + * C API specific test + * Test dzsum by comparing it against pre-calculated values + */ +CTEST(dzsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0 : 1.0; + } + double sum = cblas_dzsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c new file mode 100644 index 000000000..cca464eac --- /dev/null +++ b/utest/test_extensions/test_icamin.c @@ -0,0 +1,625 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0f; + x[(N - 1) * inc * 2 + 1] = 0.0f; + blasint index = BLASFUNC(icamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test icamin by comparing it against pre-calculated values + */ +CTEST(icamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0f; + x[(N - 1) * inc * 2 + 1] = 0.0f; + blasint index = cblas_icamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c new file mode 100644 index 000000000..9f099f666 --- /dev/null +++ b/utest/test_extensions/test_idamin.c @@ -0,0 +1,787 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_DOUBLE + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * Fortran API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * inc]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(idamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.1}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.1}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.1, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.1, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.1, 1.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.1, 1.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.1, 1.0, 2.2}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.1, 1.0, -2.2}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {1.1, 1.0, 2.2, 3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_5){ + blasint N = 5, inc = 1; + double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_5){ + blasint N = 5, inc = 2; + double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0}; + + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} + +/** + * C API specific test + * Test idamin by comparing it against pre-calculated values + */ +CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * inc]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0; + blasint index = cblas_idamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c new file mode 100644 index 000000000..df8dead07 --- /dev/null +++ b/utest/test_extensions/test_isamin.c @@ -0,0 +1,787 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(5, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(9, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * Fortran API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * inc]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = BLASFUNC(isamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(4, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(8, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} + +/** + * C API specific test + * Test isamin by comparing it against pre-calculated values + */ +CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * inc]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc] = 0.0f; + blasint index = cblas_isamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c new file mode 100644 index 000000000..a0bdae8e2 --- /dev/null +++ b/utest/test_extensions/test_izamin.c @@ -0,0 +1,625 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(0, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(1, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(2, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(8, index); +} + +/** + * Fortran API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0; + x[(N - 1) * inc * 2 + 1] = 0.0; + blasint index = BLASFUNC(izamin)(&N, x, &inc); + ASSERT_EQUAL(N, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + double x[ELEMENTS * 2]; + for (i = 0; i < N * 2; i ++) { + x[i] = i - 1000; + } + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {1.0, 2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_1){ + blasint N = 1, inc = 1; + double x[] = {-1.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_1){ + blasint N = 1, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(0, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_2){ + blasint N = 2, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_2){ + blasint N = 2, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_3){ + blasint N = 3, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_3){ + blasint N = 3, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_4){ + blasint N = 4, inc = 1; + double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_4){ + blasint N = 4, inc = 2; + double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0}; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(1, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + double x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_positive_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_negative_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0; + x[7 * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(7, index); +} + +/** + * C API specific test + * Test izamin by comparing it against pre-calculated values + */ +CTEST(izamin, c_api_min_idx_in_vec_tail){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + double x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + + x[(N - 1) * inc * 2] = 0.0; + x[(N - 1) * inc * 2 + 1] = 0.0; + blasint index = cblas_izamin(N, x, inc); + ASSERT_EQUAL(N - 1, index); +} +#endif diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c new file mode 100644 index 000000000..5c747a0f6 --- /dev/null +++ b/utest/test_extensions/test_samin.c @@ -0,0 +1,354 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.1f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.1f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, 1.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.1f, 1.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 0.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + + x[8 * inc] = 1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} + +/** + * Test samin by comparing it against pre-calculated values + */ +CTEST(samin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = - i - 1000; + } + + x[8 * inc] = -1.0f; + float amin = BLASFUNC(samin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c new file mode 100644 index 000000000..b4bd5cf0b --- /dev/null +++ b/utest/test_extensions/test_saxpby.c @@ -0,0 +1,794 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_SAXPBY { + float x_test[DATASIZE * INCREMENT]; + float x_verify[DATASIZE * INCREMENT]; + float y_test[DATASIZE * INCREMENT]; + float y_verify[DATASIZE * INCREMENT]; +}; +#ifdef BUILD_SINGLE +static struct DATA_SAXPBY data_saxpby; + +/** + * Fortran API specific function + * Test saxpby by comparing it with sscal and saxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy) +{ + blasint i; + + // sscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + srand_generate(data_saxpby.x_test, n * incx_abs); + srand_generate(data_saxpby.y_test, n * incy_abs); + + // Copy vector x for saxpy + for (i = 0; i < n * incx_abs; i++) + data_saxpby.x_verify[i] = data_saxpby.x_test[i]; + + // Copy vector y for sscal + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_verify[i] = data_saxpby.y_test[i]; + + // Find beta*y + BLASFUNC(sscal)(&n, &beta, data_saxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(saxpy)(&n, &alpha, data_saxpby.x_verify, &incx, + data_saxpby.y_verify, &incy); + + BLASFUNC(saxpby)(&n, &alpha, data_saxpby.x_test, &incx, + &beta, data_saxpby.y_test, &incy); + + // Find the differences between output vector caculated by saxpby and saxpy + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_test[i] -= data_saxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test saxpby by comparing it with sscal and saxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy) +{ + blasint i; + + // sscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Copy vector x for saxpy + for (i = 0; i < n * incx_abs; i++) + data_saxpby.x_verify[i] = data_saxpby.x_test[i]; + + // Copy vector y for sscal + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_verify[i] = data_saxpby.y_test[i]; + + // Find beta*y + cblas_sscal(n, beta, data_saxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_saxpy(n, alpha, data_saxpby.x_verify, incx, + data_saxpby.y_verify, incy); + + cblas_saxpby(n, alpha, data_saxpby.x_test, incx, + beta, data_saxpby.y_test, incy); + + // Find the differences between output vector caculated by saxpby and saxpy + for (i = 0; i < n * incy_abs; i++) + data_saxpby.y_test[i] -= data_saxpby.y_verify[i]; + + // Find the norm of differences + return cblas_snrm2(n, data_saxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(saxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 3.0f; + float beta = 4.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(saxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha = 5.0f; + float beta = 4.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(saxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha = 1.0f; + float beta = 6.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(saxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha = 7.0f; + float beta = 3.5f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(saxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 3.0f; + float beta = 4.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(saxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + float alpha = 5.0f; + float beta = 4.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(saxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + float alpha = 1.0f; + float beta = 6.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(saxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + float alpha = 7.0f; + float beta = 3.5f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test saxpby by comparing it with sscal and saxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + float alpha = 0.0f; + float beta = 0.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(saxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + float alpha = 1.0f; + float beta = 1.0f; + + float norm = c_api_check_saxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c new file mode 100644 index 000000000..39d7201ff --- /dev/null +++ b/utest/test_extensions/test_scamax.c @@ -0,0 +1,294 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -3.0f, -1.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -3.0f, -1.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} + +/** + * Test scamax by comparing it against pre-calculated values + */ +CTEST(scamax, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = -i; + } + x[7 * inc * 2] = 1000.0f; + x[7 * inc * 2 + 1] = 1000.0f; + float amax = BLASFUNC(scamax)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c new file mode 100644 index 000000000..4baa23184 --- /dev/null +++ b/utest/test_extensions/test_scamin.c @@ -0,0 +1,310 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 70 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, step_zero){ + blasint i; + blasint N = ELEMENTS * 2, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.0f, 2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {-1.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f}; + + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_1_N_70){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, positive_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} + +/** + * Test scamin by comparing it against pre-calculated values + */ +CTEST(scamin, negative_step_2_N_70){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = - i - 1000; + } + x[7 * inc * 2] = 0.0f; + x[7 * inc * 2 + 1] = 0.0f; + float amin = BLASFUNC(scamin)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c new file mode 100644 index 000000000..492e1a4ca --- /dev/null +++ b/utest/test_extensions/test_scsum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_COMPLEX + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f, 2.3f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f, 2.3f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f}; + + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(scsum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < ELEMENTS * inc * 2; i ++) { + x[i] = 1000 - i; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f, 2.3f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f, 2.3f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f}; + + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test scsum by comparing it against pre-calculated values + */ +CTEST(scsum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT * 2]; + for (i = 0; i < N * inc * 2; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_scsum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c new file mode 100644 index 000000000..b42ce9c0e --- /dev/null +++ b/utest/test_extensions/test_sgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_SGEADD +{ + float a_test[M * N]; + float c_test[M * N]; + float c_verify[M * N]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SGEADD data_sgeadd; + +/** + * sgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr, + blasint lda, float beta, float *cptr, blasint ldc) +{ + blasint i; + + for (i = 0; i < n; i++) + { + cblas_saxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test sgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, float alpha, blasint lda, + float beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + srand_generate(data_sgeadd.a_test, lda * rows); + srand_generate(data_sgeadd.c_test, ldc * rows); + + // Copy matrix C for sgeadd + for (i = 0; i < ldc * rows; i++) + data_sgeadd.c_verify[i] = data_sgeadd.c_test[i]; + + sgeadd_trusted(cols, rows, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(sgeadd) + (&m, &n, &alpha, data_sgeadd.a_test, &lda, + &beta, data_sgeadd.c_test, &ldc); + else + cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by sgeadd and sgemm + return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, cols, rows, ldc); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in sgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + float alpha = 1.0f; + float beta = 1.0f; + + set_xerbla("SGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(sgeadd) + (&m, &n, &alpha, data_sgeadd.a_test, &lda, + &beta, data_sgeadd.c_test, &ldc); + else + cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda, + beta, data_sgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 3.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(sgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 2.5f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(sgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 0.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(sgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(sgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(sgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(sgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 6; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(sgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(sgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(sgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 2.0f; + float beta = 3.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 4.0f; + float beta = 2.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + float alpha = 3.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 0.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + float alpha = 0.0f; + float beta = 0.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(sgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + float alpha = 3.0f; + float beta = 4.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(sgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(sgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(sgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + float alpha = 1.0f; + float beta = 1.0f; + + float norm = check_sgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c new file mode 100644 index 000000000..5b51e3579 --- /dev/null +++ b/utest/test_extensions/test_sgemmt.c @@ -0,0 +1,1442 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SGEMMT { + float a_test[DATASIZE * DATASIZE]; + float b_test[DATASIZE * DATASIZE]; + float c_test[DATASIZE * DATASIZE]; + float c_verify[DATASIZE * DATASIZE]; + float c_gemm[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SGEMMT data_sgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float alpha, blasint lda, + blasint ldb, float beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, data_sgemmt.a_test, &lda, + data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_gemm, &ldc); + else + cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } +} + +/** + * Comapare results computed by sgemmt and sgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, float alpha, blasint lda, + blasint ldb, float beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + srand_generate(data_sgemmt.a_test, a_cols * lda); + srand_generate(data_sgemmt.b_test, b_cols * ldb); + srand_generate(data_sgemmt.c_test, m * ldc); + + for (i = 0; i < m * ldc; i++) + data_sgemmt.c_gemm[i] = data_sgemmt.c_verify[i] = data_sgemmt.c_test[i]; + + sgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, + &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); + else + cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); + + for (i = 0; i < m * ldc; i++) + data_sgemmt.c_verify[i] -= data_sgemmt.c_test[i]; + + return BLASFUNC(snrm2)(&size_c, data_sgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in sgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + float alpha = 1.0f; + float beta = 0.0f; + + set_xerbla("SGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test, + &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc); + else + cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda, + data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 100; + float alpha = 1.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans) +{ + blasint M = 50, K = 100; + blasint lda = 100, ldb = 100, ldc = 50; + float alpha = 1.0f; + float beta = 0.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 1.5f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha = 0.0 + */ +CTEST(sgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 0.0f; + float beta = 2.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test sgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta = 1.0 + */ +CTEST(sgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + float alpha = 2.0f; + float beta = 1.0f; + + float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(sgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(sgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(sgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(sgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(sgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c new file mode 100644 index 000000000..0d9c44e73 --- /dev/null +++ b/utest/test_extensions/test_simatcopy.c @@ -0,0 +1,947 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SIMATCOPY { + float a_test[DATASIZE* DATASIZE]; + float a_verify[DATASIZE* DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SIMATCOPY data_simatcopy; + +/** + * Comapare results computed by simatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static float check_simatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m; + } + else { + rows_out = m; cols_out = n; + } + + srand_generate(data_simatcopy.a_test, lda_src*m); + + if (trans == 'T' || trans == 'C') { + stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + } + else { + scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + } + + if (api == 'F') { + BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by simatcopy and reference func + return smatrix_difference(data_simatcopy.a_test, data_simatcopy.a_verify, cols_out, rows_out, lda_dst); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + float alpha = 1.0f; + + set_xerbla("SIMATCOPY", expected_info); + + BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_one) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_zero) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_one) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_zero) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific tests + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 1.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_one) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_zero) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0f + */ +CTEST(simatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test simatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 2.0f + */ +CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(simatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(simatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(simatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda_src = 0, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(simatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda_src = 100, lda_dst = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(simatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(simatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c new file mode 100644 index 000000000..c75bbc75e --- /dev/null +++ b/utest/test_extensions/test_somatcopy.c @@ -0,0 +1,672 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_SOMATCOPY { + float a_test[DATASIZE * DATASIZE]; + float b_test[DATASIZE * DATASIZE]; + float b_verify[DATASIZE * DATASIZE]; +}; + +#ifdef BUILD_SINGLE +static struct DATA_SOMATCOPY data_somatcopy; + +/** + * Comapare results computed by somatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static float check_somatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m; + } + else { + b_rows = m; b_cols = n; + } + + srand_generate(data_somatcopy.a_test, lda*m); + + if (trans == 'T' || trans == 'C') { + stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + } + else { + scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + } + + if (api == 'F') { + BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, + &lda, data_somatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test, + lda, data_somatcopy.b_test, ldb); + } + + return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + float alpha = 1.0; + + set_xerbla("SOMATCOPY", expected_info); + + BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, + &lda, data_somatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, colmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 50; + char order = 'C'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, rowmajor_conjtrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; // same as trans for real matrix + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 2.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + float alpha = 2.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Matrix dimensions leave residues from 4 and 2 (specialize + * for rt case) + * alpha = 1.5 + */ +CTEST(somatcopy, rowmajor_trans_col_27_row_27) +{ + blasint m = 27, n = 27; + blasint lda = 27, ldb = 27; + char order = 'R'; + char trans = 'T'; + float alpha = 1.5f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha = 0.0 + */ +CTEST(somatcopy, rowmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 0.0f; + + float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'T'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * C API specific test + * Test somatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha = 1.0 + */ +CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'N'; + float alpha = 1.0f; + + float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(somatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(somatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(somatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda = 0, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(somatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda = 100, ldb = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using row major layout, + * lda must be at least n. + */ +CTEST(somatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda. + * If matrices are stored using column major layout, + * lda must be at least m. + */ +CTEST(somatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is no transposition, ldb must be at least n. + */ +CTEST(somatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using row major layout and + * there is transposition, ldb must be at least m. + */ +CTEST(somatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is no transposition, ldb must be at least m. + */ +CTEST(somatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param ldb. + * If matrices are stored using column major layout and + * there is transposition, ldb must be at least n. + */ +CTEST(somatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c new file mode 100644 index 000000000..3c97e3b4d --- /dev/null +++ b/utest/test_extensions/test_srotmg.c @@ -0,0 +1,414 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, y1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 0.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 0.0f; + + tr_param[0] = -2.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, d1_negative) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, d1_positive_d2_positive_x1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 0.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 8.0f; + + tr_param[0] = 1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * Fortran API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, scaled_y_greater_than_scaled_x) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0f; + te_d2 = tr_d2 = -2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_y1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 0.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 0.0f; + + tr_param[0] = -2.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_d1_negative) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = -1.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_d1_positive_d2_positive_x1_zero) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 2.0f; + te_d2 = tr_d2 = 2.0f; + te_x1 = tr_x1 = 0.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 2.0f; + tr_d2 = 2.0f; + tr_x1 = 8.0f; + tr_y1 = 8.0f; + + tr_param[0] = 1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} + +/** + * C API specific test + * Test srotmg by comparing it against pre-calculated values + */ +CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x) +{ + float te_d1, tr_d1; + float te_d2, tr_d2; + float te_x1, tr_x1; + float te_y1, tr_y1; + float te_param[5]; + float tr_param[5]; + int i = 0; + te_d1 = tr_d1 = 1.0f; + te_d2 = tr_d2 = -2.0f; + te_x1 = tr_x1 = 8.0f; + te_y1 = tr_y1 = 8.0f; + + for(i=0; i<5; i++){ + te_param[i] = tr_param[i] = 0.0f; + } + + //reference values as calculated by netlib blas + tr_d1 = 0.0f; + tr_d2 = 0.0f; + tr_x1 = 0.0f; + tr_y1 = 8.0f; + + tr_param[0] = -1.0f; + tr_param[1] = 0.0f; + tr_param[2] = 0.0f; + tr_param[3] = 0.0f; + tr_param[4] = 0.0f; + + //OpenBLAS + cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param); + + ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS); + + for(i=0; i<5; i++){ + ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS); + } +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c new file mode 100644 index 000000000..971a0d2e0 --- /dev/null +++ b/utest/test_extensions/test_ssum.c @@ -0,0 +1,403 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#define ELEMENTS 50 +#define INCREMENT 2 + +#ifdef BUILD_SINGLE + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.0f, 1.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f}; + + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * Fortran API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = BLASFUNC(ssum)(&N, x, &inc); + ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_bad_args_N_0){ + blasint i; + blasint N = 0, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < ELEMENTS * inc; i ++) { + x[i] = 1000 - i; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_zero){ + blasint i; + blasint N = ELEMENTS, inc = 0; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = i + 1000; + } + x[8] = 0.0f; + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_1){ + blasint N = 1, inc = 1; + float x[] = {1.1f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_1){ + blasint N = 1, inc = 2; + float x[] = {1.1f, 0.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_2){ + blasint N = 2, inc = 1; + float x[] = {1.1f, -1.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_2){ + blasint N = 2, inc = 2; + float x[] = {1.1f, -1.5f, 1.0f, 1.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_3){ + blasint N = 3, inc = 1; + float x[] = {1.1f, 1.0f, 2.2f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_3){ + blasint N = 3, inc = 2; + float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_4){ + blasint N = 4, inc = 1; + float x[] = {1.1f, 1.0f, -2.2f, 3.3f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_4){ + blasint N = 4, inc = 2; + float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_5){ + blasint N = 5, inc = 1; + float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_5){ + blasint N = 5, inc = 2; + float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f}; + + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_1_N_50){ + blasint i; + blasint N = ELEMENTS, inc = 1; + float x[ELEMENTS]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS); +} + +/** + * C API specific test + * Test ssum by comparing it against pre-calculated values + */ +CTEST(ssum, c_api_step_2_N_50){ + blasint i; + blasint N = ELEMENTS, inc = INCREMENT; + float x[ELEMENTS * INCREMENT]; + for (i = 0; i < N * inc; i ++) { + x[i] = (i & 1) ? -1.0f : 1.0f; + } + float sum = cblas_ssum(N, x, inc); + ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c new file mode 100644 index 000000000..6148f44c5 --- /dev/null +++ b/utest/test_extensions/test_zaxpby.c @@ -0,0 +1,630 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZAXPBY { + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; +#ifdef BUILD_COMPLEX16 +static struct DATA_ZAXPBY data_zaxpby; + +/** + * Fortran API specific function + * Test zaxpby by comparing it with zscal and zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy) +{ + blasint i; + + // zscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_zaxpby.x_test, n * incx_abs * 2); + drand_generate(data_zaxpby.y_test, n * incy_abs * 2); + + // Copy vector x for zaxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_zaxpby.x_verify[i] = data_zaxpby.x_test[i]; + + // Copy vector y for zscal + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_verify[i] = data_zaxpby.y_test[i]; + + // Find beta*y + BLASFUNC(zscal)(&n, beta, data_zaxpby.y_verify, &incy_abs); + + // Find sum of alpha*x and beta*y + BLASFUNC(zaxpy)(&n, alpha, data_zaxpby.x_verify, &incx, + data_zaxpby.y_verify, &incy); + + BLASFUNC(zaxpby)(&n, alpha, data_zaxpby.x_test, &incx, + beta, data_zaxpby.y_test, &incy); + + // Find the differences between output vector caculated by zaxpby and zaxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs); +} + +/** + * C API specific function + * Test zaxpby by comparing it with zscal and zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param beta - scalar beta + * param incy - increment for the elements of y + * return norm of difference + */ +static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy) +{ + blasint i; + + // zscal accept only positive increments + blasint incx_abs = labs(incx); + blasint incy_abs = labs(incy); + + // Fill vectors x, y + drand_generate(data_zaxpby.x_test, n * incx_abs * 2); + drand_generate(data_zaxpby.y_test, n * incy_abs * 2); + + // Copy vector x for zaxpy + for (i = 0; i < n * incx_abs * 2; i++) + data_zaxpby.x_verify[i] = data_zaxpby.x_test[i]; + + // Copy vector y for zscal + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_verify[i] = data_zaxpby.y_test[i]; + + // Find beta*y + cblas_zscal(n, beta, data_zaxpby.y_verify, incy_abs); + + // Find sum of alpha*x and beta*y + cblas_zaxpy(n, alpha, data_zaxpby.x_verify, incx, + data_zaxpby.y_verify, incy); + + cblas_zaxpby(n, alpha, data_zaxpby.x_test, incx, + beta, data_zaxpby.y_test, incy); + + // Find the differences between output vector caculated by zaxpby and zaxpy + for (i = 0; i < n * incy_abs * 2; i++) + data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i]; + + // Find the norm of differences + return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpby, inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {3.0, 1.0}; + double beta[] = {4.0, 3.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha[] = {5.0, 2.2}; + double beta[] = {4.0, 5.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha[] = {1.0, 1.0}; + double beta[] = {6.0, 3.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha[] = {7.0, 2.0}; + double beta[] = {3.5, 1.3}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(zaxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(zaxpby, check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 2.1}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {3.0, 2.0}; + double beta[] = {4.0, 3.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is 2 + */ +CTEST(zaxpby, c_api_inc_x_neg_1_inc_y_2_N_100) +{ + blasint n = DATASIZE, incx = -1, incy = 2; + double alpha[] = {5.0, 2.0}; + double beta[] = {4.0, 3.1}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, c_api_inc_x_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = 2, incy = -1; + double alpha[] = {1.0, 1.0}; + double beta[] = {6.0, 2.3}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is -1 + */ +CTEST(zaxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100) +{ + blasint n = DATASIZE, incx = -2, incy = -1; + double alpha[] = {7.0, 1.0}; + double beta[] = {3.5, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar beta is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * Scalar alpha is zero + * Scalar beta is zero + */ +CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zaxpby by comparing it with zscal and zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * Scalar alpha is zero + * Scalar beta is zero +*/ +CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if n - size of vectors x, y is zero + */ +CTEST(zaxpby, c_api_check_n_zero) +{ + blasint n = 0, incx = 1, incy = 1; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_zaxpyc.c b/utest/test_extensions/test_zaxpyc.c new file mode 100644 index 000000000..7c11cd920 --- /dev/null +++ b/utest/test_extensions/test_zaxpyc.c @@ -0,0 +1,159 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZAXPYC { + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; +#ifdef BUILD_COMPLEX16 +static struct DATA_ZAXPYC data_zaxpyc; + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Compare with the following options: + * + * param n - number of elements in vectors x and y + * param alpha - scalar alpha + * param incx - increment for the elements of x + * param incy - increment for the elements of y + * return norm of difference + */ +static double check_zaxpyc(blasint n, double *alpha, blasint incx, blasint incy) +{ + blasint i; + + drand_generate(data_zaxpyc.x_test, n * incx * 2); + drand_generate(data_zaxpyc.y_test, n * incy * 2); + + for (i = 0; i < n * incx * 2; i++) + data_zaxpyc.x_verify[i] = data_zaxpyc.x_test[i]; + + for (i = 0; i < n * incy * 2; i++) + data_zaxpyc.y_verify[i] = data_zaxpyc.y_test[i]; + + zconjugate_vector(n, incx, data_zaxpyc.x_verify); + + BLASFUNC(zaxpy) + (&n, alpha, data_zaxpyc.x_verify, &incx, + data_zaxpyc.y_verify, &incy); + + BLASFUNC(zaxpyc) + (&n, alpha, data_zaxpyc.x_test, &incx, + data_zaxpyc.y_test, &incy); + + for (i = 0; i < n * incy * 2; i++) + data_zaxpyc.y_verify[i] -= data_zaxpyc.y_test[i]; + + return BLASFUNC(dznrm2)(&n, data_zaxpyc.y_verify, &incy); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + */ +CTEST(zaxpyc, conj_strides_one) +{ + blasint n = DATASIZE, incx = 1, incy = 1; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + */ +CTEST(zaxpyc, conj_incx_one) +{ + blasint n = DATASIZE, incx = 1, incy = 2; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + */ +CTEST(zaxpyc, conj_incy_one) +{ + blasint n = DATASIZE, incx = 2, incy = 1; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zaxpyc by conjugating vector x and comparing with zaxpy. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + */ +CTEST(zaxpyc, conj_strides_two) +{ + blasint n = DATASIZE, incx = 2, incy = 2; + double alpha[] = {5.0, 2.2}; + + double norm = check_zaxpyc(n, alpha, incx, incy); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif diff --git a/utest/test_extensions/test_zgbmv.c b/utest/test_extensions/test_zgbmv.c new file mode 100644 index 000000000..55473361c --- /dev/null +++ b/utest/test_extensions/test_zgbmv.c @@ -0,0 +1,280 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 1 + +struct DATA_ZGBMV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_band_storage[DATASIZE * DATASIZE * 2]; + double matrix[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 + +static struct DATA_ZGBMV data_zgbmv; + +/** + * Transform full-storage band matrix A to band-packed storage mode. + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param a - buffer for holding band-packed matrix + * param lda - specifies the leading dimension of a + * param matrix - buffer holding full-storage band matrix A + * param ldm - specifies the leading full-storage band matrix A + */ +static void transform_to_band_storage(blasint m, blasint n, blasint kl, + blasint ku, double* a, blasint lda, + double* matrix, blasint ldm) +{ + blasint i, j, k; + for (j = 0; j < n; j++) + { + k = 2 * (ku - j); + for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) + { + a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2]; + a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1]; + } + } +} + +/** + * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals + * + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * output param band_matrix - buffer for full-storage band matrix. + * param matrix - buffer holding input general matrix + * param ldm - specifies the leading of input general matrix + */ +static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, + double *band_matrix, double *matrix, blasint ldm) +{ + blasint i, j; + blasint k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < m * 2; j += 2) + { + if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) + { + band_matrix[i * ldm * 2 + j] = 0.0; + band_matrix[i * ldm * 2 + j + 1] = 0.0; + continue; + } + + band_matrix[i * ldm * 2 + j] = matrix[k++]; + band_matrix[i * ldm * 2 + j + 1] = matrix[k++]; + } + } +} + +/** + * Comapare results computed by zgbmv and zgemv + * since gbmv is gemv for band matrix + * + * param trans specifies op(A), the transposition operation applied to A + * param m - number of rows of A + * param n - number of columns of A + * param kl - number of sub-diagonals of the matrix A + * param ku - number of super-diagonals of the matrix A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku, + double *alpha, blasint lda, blasint inc_b, double *beta, blasint inc_c) +{ + blasint i; + blasint lenb, lenc; + + if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){ + lenb = m; + lenc = n; + } else { + lenb = n; + lenc = m; + } + + drand_generate(data_zgbmv.matrix, m * n * 2); + drand_generate(data_zgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b)); + drand_generate(data_zgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c)); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_zgbmv.c_verify[i] = data_zgbmv.c_test[i]; + + get_band_matrix(m, n, kl, ku, data_zgbmv.a_test, data_zgbmv.matrix, m); + + transform_to_band_storage(m, n, kl, ku, data_zgbmv.a_band_storage, lda, data_zgbmv.a_test, m); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgbmv.a_test, &m, data_zgbmv.b_test, + &inc_b, beta, data_zgbmv.c_verify, &inc_c); + + BLASFUNC(zgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_zgbmv.a_band_storage, &lda, data_zgbmv.b_test, + &inc_b, beta, data_zgbmv.c_test, &inc_c); + + for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++) + data_zgbmv.c_verify[i] -= data_zgbmv.c_test[i]; + + return BLASFUNC(dznrm2)(&lenc, data_zgbmv.c_verify, &inc_c); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is D + */ +CTEST(zgbmv, trans_D) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'D'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is O + */ +CTEST(zgbmv, trans_O) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 10; + blasint lda = 50; + char trans = 'O'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is S + */ +CTEST(zgbmv, trans_S) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 6, ku = 9; + blasint lda = 50; + char trans = 'S'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is U + */ +CTEST(zgbmv, trans_U) +{ + blasint m = 25, n = 50; + blasint inc_b = 1, inc_c = 1; + blasint kl = 7, ku = 11; + blasint lda = kl + ku + 1; + char trans = 'U'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is C + */ +CTEST(zgbmv, trans_C) +{ + blasint m = 50, n = 25; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'C'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgbmv by comparing it against zgemv + * with param trans is R + */ +CTEST(zgbmv, trans_R) +{ + blasint m = 50, n = 100; + blasint inc_b = 1, inc_c = 1; + blasint kl = 20, ku = 11; + blasint lda = 50; + char trans = 'R'; + + double alpha[] = {7.0, 1.0}; + double beta[] = {1.5, -1.5}; + + double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} +#endif diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c new file mode 100644 index 000000000..917c04829 --- /dev/null +++ b/utest/test_extensions/test_zgeadd.c @@ -0,0 +1,880 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 + +struct DATA_ZGEADD { + double a_test[M * N * 2]; + double c_test[M * N * 2]; + double c_verify[M * N * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEADD data_zgeadd; + +/** + * zgeadd reference implementation + * + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param aptr - refer to matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param cptr - refer to matrix C + * param ldc - leading dimension of C + */ +static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr, + blasint lda, double *beta, double *cptr, blasint ldc) +{ + blasint i; + + lda *= 2; + ldc *= 2; + + for (i = 0; i < n; i++) + { + cblas_zaxpby(m, alpha, aptr, 1, beta, cptr, 1); + aptr += lda; + cptr += ldc; + } +} + +/** + * Test zgeadd by comparing it against reference + * Compare with the following options: + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param alpha - scaling factor for matrix A + * param lda - leading dimension of A + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, double *alpha, blasint lda, + double *beta, blasint ldc) +{ + blasint i; + blasint cols = m, rows = n; + + if (order == CblasRowMajor) + { + rows = m; + cols = n; + } + + // Fill matrix A, C + srand_generate(data_zgeadd.a_test, lda * rows * 2); + srand_generate(data_zgeadd.c_test, ldc * rows * 2); + + // Copy matrix C for zgeadd + for (i = 0; i < ldc * rows * 2; i++) + data_zgeadd.c_verify[i] = data_zgeadd.c_test[i]; + + zgeadd_trusted(cols, rows, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_verify, ldc); + + if (api == 'F') + BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, + beta, data_zgeadd.c_test, &ldc); + else + cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_test, ldc); + + // Find the differences between output matrix caculated by zgeadd and sgemm + return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, cols, rows, ldc * 2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param api - specifies Fortran or C API + * param order - specifies whether A and C stored in + * row-major order or column-major order + * param m - number of rows of A and C + * param n - number of columns of A and C + * param lda - leading dimension of A + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in zgeadd + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order, + blasint m, blasint n, blasint lda, + blasint ldc, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + set_xerbla("ZGEADD ", expected_info); + + if (api == 'F') + BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda, + beta, data_zgeadd.c_test, &ldc); + else + cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda, + beta, data_zgeadd.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 2.0}; + double beta[] = {1.0, 3.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(zgeadd, matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(zgeadd, matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 1.5}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(zgeadd, matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n - + * number of columns of A and C + * Must be at least zero. + */ +CTEST(zgeadd, xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = m; + blasint ldc = m; + + int expected_info = 2; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + */ +CTEST(zgeadd, xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + */ +CTEST(zgeadd, xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 6; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + */ +CTEST(zgeadd, xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Check if n - number of columns of A, C equal zero. + */ +CTEST(zgeadd, n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Check if m - number of rows of A and C equal zero. + */ +CTEST(zgeadd, m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 3.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {4.0, 1.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is row-major order + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_50_m_100_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = N / 2; + blasint m = M; + + blasint lda = n; + blasint ldc = n; + + double alpha[] = {3.0, 2.5}; + double beta[] = {1.0, 2.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar alpha is zero (operation is C:=beta*C) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalar beta is zero (operation is C:=alpha*A) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {3.0, 1.5}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * c api option order is column-major order + * For A number of rows is 100, number of colums is 100 + * For C number of rows is 100, number of colums is 100 + * Scalars alpha, beta is zero (operation is C:= 0) + */ +CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {0.0, 0.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgeadd by comparing it against reference + * with the following options: + * + * For A number of rows is 50, number of colums is 100 + * For C number of rows is 50, number of colums is 100 + */ +CTEST(zgeadd, c_api_matrix_n_100_m_50) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = N; + blasint m = M / 2; + + blasint lda = m; + blasint ldc = m; + + double alpha[] = {2.0, 3.0}; + double beta[] = {2.0, 4.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test error function for an invalid param order - + * specifies whether A and C stored in + * row-major order or column-major order + */ +CTEST(zgeadd, c_api_xerbla_invalid_order) +{ + CBLAS_ORDER order = INVALID; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 0; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_n_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n - + * number of columns of A and C. + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_n_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = INVALID; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_m_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 1; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m - + * number of rows of A and C + * Must be at least zero. + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_m_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = INVALID; + + blasint lda = 1; + blasint ldc = 1; + + int expected_info = 2; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_lda_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda - + * specifies the leading dimension of A. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_lda_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = INVALID; + blasint ldc = 1; + + int expected_info = 5; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_xerbla_ldc_invalid) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param ldc - + * specifies the leading dimension of C. Must be at least MAX(1, m). + * + * c api option order is row-major order + */ +CTEST(zgeadd, c_api_xerbla_ldc_invalid_row_major) +{ + CBLAS_ORDER order = CblasRowMajor; + + blasint n = 1; + blasint m = 1; + + blasint lda = 1; + blasint ldc = INVALID; + + int expected_info = 8; + + int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Check if n - number of columns of A, C equal zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_n_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 0; + blasint m = 1; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Check if m - number of rows of A and C equal zero. + * + * c api option order is column-major order + */ +CTEST(zgeadd, c_api_m_zero) +{ + CBLAS_ORDER order = CblasColMajor; + + blasint n = 1; + blasint m = 0; + + blasint lda = 1; + blasint ldc = 1; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgeadd('C', order, m, n, alpha, + lda, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c new file mode 100644 index 000000000..4160a5086 --- /dev/null +++ b/utest/test_extensions/test_zgemm.c @@ -0,0 +1,273 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZGEMM { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double b_verify[DATASIZE * DATASIZE * 2]; + double c_test[DATASIZE * DATASIZE * 2]; + double c_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMM data_zgemm; + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * + * param transa specifies op(A), the transposition (conjugation) operation applied to A + * param transb specifies op(B), the transposition (conjugation) operation applied to B + * param m specifies the number of rows of the matrix op(A) and of the matrix C + * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C + * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of matrix A + * param ldb - leading dimension of matrix B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of matrix C + * return norm of difference + */ +static double check_zgemm(char transa, char transb, blasint m, blasint n, blasint k, + double *alpha, blasint lda, blasint ldb, double *beta, blasint ldc) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char transa_verify = transa; + char transb_verify = transb; + + int arows = k, acols = m; + int brows = n, bcols = k; + + if (transa == 'T' || transa == 'C'){ + arows = m; acols = k; + } + + if (transb == 'T' || transb == 'C'){ + brows = k; bcols = n; + } + + drand_generate(data_zgemm.a_test, arows * lda * 2); + drand_generate(data_zgemm.b_test, brows * ldb * 2); + drand_generate(data_zgemm.c_test, n * ldc * 2); + + for (i = 0; i < arows * lda * 2; i++) + data_zgemm.a_verify[i] = data_zgemm.a_test[i]; + + for (i = 0; i < brows * ldb * 2; i++) + data_zgemm.b_verify[i] = data_zgemm.b_test[i]; + + for (i = 0; i < n * ldc * 2; i++) + data_zgemm.c_verify[i] = data_zgemm.c_test[i]; + + if (transa == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda); + transa_verify = 'N'; + } + + if (transb == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb); + transb_verify = 'N'; + } + + BLASFUNC(zgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_zgemm.a_verify, &lda, + data_zgemm.b_verify, &ldb, beta, data_zgemm.c_verify, &ldc); + + BLASFUNC(zgemm)(&transa, &transb, &m, &n, &k, alpha, data_zgemm.a_test, &lda, + data_zgemm.b_test, &ldb, beta, data_zgemm.c_test, &ldc); + + return dmatrix_difference(data_zgemm.c_test, data_zgemm.c_verify, m, n, ldc*2); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, conjtransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'C'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is not conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, notransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'N'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and transposed + */ +CTEST(zgemm, conjnotransa_conjtransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'C'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is not conjugate and not transposed + */ +CTEST(zgemm, conjnotransa_notransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'N'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, conjnotransa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is conjugate and not transposed + * matrix B is transposed + */ +CTEST(zgemm, conjnotransa_transb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'R'; + char transb = 'T'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test zgemm with the conjugate matrices by conjugating and not transposed matrices + * and comparing it with the non-conjugate zgemm. + * Test with the following options: + * + * matrix A is transposed + * matrix B is conjugate and not transposed + */ +CTEST(zgemm, transa_conjnotransb) +{ + blasint n = DATASIZE, m = DATASIZE, k = DATASIZE; + blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE; + char transa = 'T'; + char transb = 'R'; + double alpha[] = {-2.0, 1.0}; + double beta[] = {1.0, -1.0}; + + double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c new file mode 100644 index 000000000..c55381008 --- /dev/null +++ b/utest/test_extensions/test_zgemmt.c @@ -0,0 +1,2010 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZGEMMT { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double c_test[DATASIZE * DATASIZE * 2]; + double c_verify[DATASIZE * DATASIZE * 2]; + double c_gemm[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMMT data_zgemmt; + +/** + * Compute gemmt via gemm since gemmt is gemm but updates only + * the upper or lower triangular part of the result matrix + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + */ +static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double *alpha, blasint lda, + blasint ldb, double *beta, blasint ldc) +{ + blasint i, j; + + if(api == 'F') + BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda, + data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc); + else + cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc); + + ldc *= 2; + + if (uplo == 'L' || uplo == CblasLower) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } +} + +/** + * Comapare results computed by zgemmt and zgemmt_trusted + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order (for Fortran API column major always) + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param alpha - scaling factor for the matrix-matrix product + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param beta - scaling factor for matrix C + * param ldc - leading dimension of C + * return norm of differences + */ +static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, double *alpha, blasint lda, + blasint ldb, double *beta, blasint ldc) +{ + blasint i; + blasint b_cols; + blasint a_cols; + blasint inc = 1; + blasint size_c = m * ldc * 2; + + if(order == CblasColMajor){ + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = m; + else a_cols = k; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = k; + else b_cols = m; + } else { + if (transa == 'T' || transa == 'C' || + transa == CblasTrans || transa == CblasConjTrans) + a_cols = k; + else a_cols = m; + + if (transb == 'T' || transb == 'C' || + transb == CblasTrans || transb == CblasConjTrans) + b_cols = m; + else b_cols = k; + } + + drand_generate(data_zgemmt.a_test, a_cols * lda * 2); + drand_generate(data_zgemmt.b_test, b_cols * ldb * 2); + drand_generate(data_zgemmt.c_test, m * ldc * 2); + + for (i = 0; i < m * ldc * 2; i++) + data_zgemmt.c_gemm[i] = data_zgemmt.c_verify[i] = data_zgemmt.c_test[i]; + + zgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc); + + if (api == 'F') + BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, + &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); + else + cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); + + for (i = 0; i < m * ldc * 2; i++) + data_zgemmt.c_verify[i] -= data_zgemmt.c_test[i]; + + return BLASFUNC(dnrm2)(&size_c, data_zgemmt.c_verify, &inc) / size_c; +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether C’s data is stored in its upper or lower triangle + * param transa specifies op(A), the transposition operation applied to A + * param transb specifies op(B), the transposition operation applied to B + * param m - number of rows of op(A), columns of op(B), and columns and rows of C + * param k - number of columns of op(A) and rows of op(B) + * param lda - leading dimension of A + * param ldb - leading dimension of B + * param ldc - leading dimension of C + * param expected_info - expected invalid parameter number in zgemmt + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, + char transb, blasint m, blasint k, blasint lda, blasint ldb, + blasint ldc, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + set_xerbla("ZGEMMT ", expected_info); + + if (api == 'F') + BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test, + &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc); + else + cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda, + data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'U'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'U'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + char transa = 'N', transb = 'T'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'T'; + char uplo = 'L'; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + char transa = 'R', transb = 'R'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'R'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'R', transb = 'C'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'C', transb = 'C'; + char uplo = 'L'; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 25, ldb = 25, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 25, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Column Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_colmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {-1.0, -1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans) +{ + blasint M = 25, K = 50; + blasint lda = 25, ldb = 50, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_upper_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its upper triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_upper_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A not transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 50, ldc = 25; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A transposed + * B transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.5, 0.5}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans) +{ + blasint M = 25, K = 50; + blasint lda = 50, ldb = 25, ldc = 25; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate not transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate not transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * A conjugate transposed + * B conjugate transposed + */ +CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans) +{ + blasint M = 50, K = 25; + blasint lda = 50, ldb = 25, ldc = 50; + double alpha[] = {2.0, 1.0}; + double beta[] = {1.5, 2.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * alpha_r = 0.0, alpha_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_lower_alpha_zero) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {0.0, 0.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zgemmt by comparing it against sgemm + * with the following options: + * + * Row Major + * C’s data is stored in its lower triangle part + * beta_r = 1.0, beta_i = 0.0 + */ +CTEST(zgemmt, c_api_rowmajor_lower_beta_one) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans, + M, K, alpha, lda, ldb, beta, ldc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test error function for an invalid param uplo. + * Must be upper (U) or lower (L). + */ +CTEST(zgemmt, xerbla_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transa. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zgemmt, xerbla_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'O', transb = 'N'; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param transb. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zgemmt, xerbla_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'O'; + char uplo = 'U'; + int expected_info = 3; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 4; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 5; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 8; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(zgemmt, xerbla_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'N', transb = 'N'; + char uplo = 'U'; + int expected_info = 10; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + char transa = 'T', transb = 'N'; + char uplo = 'U'; + int expected_info = 13; + + int passed = check_badargs('F', CblasColMajor, uplo, transa, transb, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. + * Test error function for an invalid param order. + * Must be column or row major. + */ +CTEST(zgemmt, xerbla_c_api_major_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 0; + + int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 100, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B not transposed. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Column Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_c_api_colmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param uplo. + * Must be upper or lower. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_uplo_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 1; + + int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transa. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_transa_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 2; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param transb. + * Must be trans or no-trans. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_transb_invalid) +{ + blasint M = 50, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 3; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O', + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param M. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_m_invalid) +{ + blasint M = -1, K = 50; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 4; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param K. + * Must be positive. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_k_invalid) +{ + blasint M = 50, K = -1; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 5; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param lda. + * Must be must be at least K if matrix A transposed. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_lda_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 8; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldb. + * Must be must be at least K if matrix B transposed. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_ldb_invalid) +{ + blasint M = 50, K = 100; + blasint lda = 50, ldb = 50, ldc = 50; + int expected_info = 10; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test. Row Major + * Test error function for an invalid param ldc. + * Must be must be at least M. + */ +CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid) +{ + blasint M = 100, K = 50; + blasint lda = 100, ldb = 100, ldc = 50; + int expected_info = 13; + + int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans, + M, K, lda, ldb, ldc, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zgemv_n.c b/utest/test_extensions/test_zgemv_n.c new file mode 100644 index 000000000..903b855e1 --- /dev/null +++ b/utest/test_extensions/test_zgemv_n.c @@ -0,0 +1,341 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSPMV_N { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 + +static struct DATA_ZSPMV_N data_zgemv_n; + +/** + * zgemv not transposed reference code + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param a - buffer holding input matrib A + * param lda - leading dimension of matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + */ +static void zgemv_n_trusted(char trans, blasint m, blasint n, double *alpha, double *a, + blasint lda, double *b, blasint inc_b, double *beta, double *c, + blasint inc_c) +{ + blasint i, j; + blasint i2 = 0; + blasint ib = 0, ic = 0; + + double temp_r, temp_i; + + double *a_ptr = a; + blasint lda2 = 2*lda; + + blasint inc_b2 = 2 * inc_b; + blasint inc_c2 = 2 * inc_c; + + BLASFUNC(zscal)(&m, beta, c, &inc_c); + + for (j = 0; j < n; j++) + { + + if (trans == 'N' || trans == 'R') { + temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib]; + } else { + temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1]; + temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib]; + } + + ic = 0; + i2 = 0; + + for (i = 0; i < m; i++) + { + if (trans == 'N') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + if (trans == 'O') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'R') { + c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2]; + } + if (trans == 'S') { + c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1]; + c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2]; + } + i2 += 2; + ic += inc_c2; + } + a_ptr += lda2; + ib += inc_b2; + } + +} + +/** + * Comapare results computed by zgemv and zgemv_n_trusted + * + * param trans specifies whether matris A is conj or/and xconj + * param m - number of rows of A + * param n - number of columns of A + * param alpha - scaling factor for the matrib-vector product + * param lda - leading dimension of matrix A + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zgemv_n(char trans, blasint m, blasint n, double *alpha, blasint lda, + blasint inc_b, double *beta, blasint inc_c) +{ + blasint i; + + drand_generate(data_zgemv_n.a_test, n * lda); + drand_generate(data_zgemv_n.b_test, 2 * n * inc_b); + drand_generate(data_zgemv_n.c_test, 2 * m * inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_zgemv_n.c_verify[i] = data_zgemv_n.c_test[i]; + + zgemv_n_trusted(trans, m, n, alpha, data_zgemv_n.a_test, lda, data_zgemv_n.b_test, + inc_b, beta, data_zgemv_n.c_test, inc_c); + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_n.a_test, &lda, data_zgemv_n.b_test, + &inc_b, beta, data_zgemv_n.c_verify, &inc_c); + + for (i = 0; i < m * 2 * inc_c; i++) + data_zgemv_n.c_verify[i] -= data_zgemv_n.c_test[i]; + + return BLASFUNC(dznrm2)(&n, data_zgemv_n.c_verify, &inc_c); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_o_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zgemv, trans_o_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'O'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_square_matrix) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {1.0, 1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 50 + * Number of colums of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_rectangular_matrix_rows_less_then_cols) +{ + blasint n = 100, m = 50, lda = 50; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows of A is 100 + * Number of colums of A is 50 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zgemv, trans_s_rectangular_matrix_cols_less_then_rows) +{ + blasint n = 50, m = 100, lda = 100; + blasint inc_b = 1, inc_c = 1; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.4, 0.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zgemv by comparing it against reference + * with the following options: + * + * A is xconj and conj + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zgemv, trans_s_double_strides) +{ + blasint n = 100, m = 100, lda = 100; + blasint inc_b = 2, inc_c = 2; + char trans = 'S'; + double alpha[] = {2.0, -1.0}; + double beta[] = {1.0, 5.0}; + + double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c new file mode 100644 index 000000000..2e0ee65f0 --- /dev/null +++ b/utest/test_extensions/test_zgemv_t.c @@ -0,0 +1,1136 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define N 100 +#define M 100 +#define INCREMENT 2 + +struct DATA_ZGEMV_T { + double a_test[N * M * 2]; + double a_verify[N * M * 2]; + double y_test[M * INCREMENT * 2]; + double y_verify[M * INCREMENT * 2]; + double x_test[M * INCREMENT * 2]; + double x_verify[M * INCREMENT * 2]; +}; + +// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON +// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11 +#define DOUBLE_EPS_ZGEMV 1e-11 + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZGEMV_T data_zgemv_t; + +/** + * Find product of matrix-vector multiplication + * + * param n specifies number of columns of A + * param m specifies number of rows of A and size of vector x + * param lda specifies leading dimension of A + * param inc_x specifies increment of vector x + */ +static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x) +{ + blasint i; + double *a_ptr = data_zgemv_t.a_verify; + double *x_ptr = data_zgemv_t.x_test; + double *x_res = data_zgemv_t.x_verify; + + openblas_complex_double result; + + for (i = 0; i < n * inc_x; i += inc_x) + { + result = cblas_zdotu(lda, a_ptr, 1, x_ptr, inc_x); + x_res[0] = CREAL(result); + x_res[1] = CIMAG(result); + a_ptr += lda * 2; + x_res += 2 * inc_x; + } +} + +/** + * Test zgemv by comparing it against zomatcopy, zaxpby and + * reference func matrix_vector_product + * + * zomatcopy perform operation: op(A) + * matrix_vector_product perform operation: A*x + * zaxpby perform operation: alpha*x + beta*y + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param alpha specifies scalar alpha + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param beta specifies scalar beta + * param inc_y specifies increment for vector y + * return norm of difference between zgemv and result of reference funcs + */ +static double check_zgemv(char api, char order, char trans, blasint m, blasint n, double *alpha, + blasint lda, blasint inc_x, double *beta, blasint inc_y) +{ + blasint i; + + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + + // Transpose parameters for zomatcopy + // zgemv_t perform operation on transposed matrix, no need to transpose a_verify + char trans_copy; + char ctrans_copy; + + // Param alpha for zomatcopy, scale on alpha perform zaxpby + double alpha_one[] = {1.0, 0.0}; + + memset(data_zgemv_t.x_verify, 0.0, m * inc_x * 2 * sizeof(double)); + + // Fill matrix A, vectors x, y + drand_generate(data_zgemv_t.a_test, lda * n * 2); + drand_generate(data_zgemv_t.x_test, m * inc_x * 2); + drand_generate(data_zgemv_t.y_test, m * inc_y * 2); + + // Copy vector y for reference funcs + for (int i = 0; i < m * inc_y * 2; i++) + { + data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; + } + + if (api == 'F') { + if (trans == 'T') trans_copy = 'N'; + if (trans == 'C') trans_copy = 'R'; + if (trans == 'U') trans_copy = 'R'; + if (trans == 'D') trans_copy = 'N'; + + // Perform operation: op(A) + BLASFUNC(zomatcopy)(&order, &trans_copy, &m, &n, alpha_one, + data_zgemv_t.a_test, &lda, data_zgemv_t.a_verify, &lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find conj(x) + if (trans == 'U' || trans == 'D') + { + zconjugate_vector(m, inc_x, data_zgemv_t.x_verify); + } + + // Find alpha*x+beta*y + BLASFUNC(zaxpby)(&n, alpha, data_zgemv_t.x_verify, &inc_x, beta, + data_zgemv_t.y_verify, &inc_y); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda, + data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;} + if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;} + if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;} + if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;} + + // Perform operation: op(A) + cblas_zomatcopy(corder, ctrans_copy, m, n, alpha_one, data_zgemv_t.a_test, lda, data_zgemv_t.a_verify, lda); + + // Find A*x + matrix_vector_product(n, m, lda, inc_x); + + // Find alpha*x+beta*y + cblas_zaxpby(n, alpha, data_zgemv_t.x_verify, inc_x, beta, data_zgemv_t.y_verify, inc_y); + + cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test, + lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y); + } + + // Find the differences between output vector caculated by zgemv and reference funcs + for (i = 0; i < m * inc_y * 2; i++) + data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i]; + + // Find the norm of differences + return cblas_dznrm2(m, data_zgemv_t.y_test, inc_y); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double a[] = {1.0, 1.0}; + double x[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + double y[] = {1.0, 1.0}; + + set_xerbla("ZGEMV ", expected_info); + + BLASFUNC(zgemv)(&trans, &m, &n, alpha, a, &lda, x, + &inc_x, beta, y, &inc_y); + + return check_error(); +} + +/** + * C API specific function + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param m specifies number of rows of A + * param n specifies number of columns of A + * param lda specifies leading dimension of the matrix A + * param inc_x specifies increment for vector x + * param inc_y specifies increment for vector y + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n, + blasint lda, blasint inc_x, blasint inc_y, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double a[] = {1.0, 1.0}; + double x[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + double y[] = {1.0, 1.0}; + + set_xerbla("ZGEMV ", expected_info); + + cblas_zgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {2.0, 2.0}; + + blasint inc_x = 2; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 2.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and x conjugate + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0, alpha_i = 2.0 + * beta_r = 1.0, beta_i = 1.0 + */ +CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'U'; + + double alpha[] = {1.0, 2.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 2; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'D'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('F', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition, conjugate A, conjugate x + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'T'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate A + * Square matrix + * inc x = 1, inc y = 2 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'C'; + char trans = 'C'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 2.0}; + + blasint inc_x = 1; + blasint inc_y = 2; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 2.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 1.0 + */ +CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {1.0, 1.0}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * No trans + * Square matrix + * inc x = 2, inc y = 2 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 3.0, beta_i = 2.0 + */ +CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'N'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {3.0, 1.0}; + + blasint inc_x = 2; + blasint inc_y = 2; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Column Major + * Conjugate + * Square matrix + * inc x = 1, inc y = 1 + * alpha_r = 1.0, alpha_i = 3.0 + * beta_r = 1.0, beta_i = 2.5 + */ +CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + double alpha[] = {1.0, 3.0}; + double beta[] = {1.0, 2.5}; + + blasint inc_x = 1; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * C API specific test + * Test zgemv by comparing it against reference + * with the following options: + * + * Row Major + * Conjugate + * Square matrix + * inc x = 2, inc y = 1 + * alpha_r = 1.0, alpha_i = 1.0 + * beta_r = 1.0, beta_i = 1.5 + */ +CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1) +{ + blasint m = 100, n = 100; + blasint lda = 100; + char order = 'R'; + char trans = 'R'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.5}; + + blasint inc_x = 2; + blasint inc_y = 1; + + double norm = check_zgemv('C', order, trans, m, n, alpha, lda, + inc_x, beta, inc_y); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_inc_y) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_y_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_y. + * Must be positive + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_y_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 0; + + int expected_info = 11; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_inc_x) +{ + char order = 'C'; + char trans = 'T'; + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_x_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param inc_x. + * Must be positive + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_inc_x_row_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 0; + blasint inc_y = 1; + + int expected_info = 8; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_n) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_n_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param n. + * Must be positive. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_n_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 3; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_m) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_m_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = INVALID, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param m. + * Must be positive. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_m_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = INVALID; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 2; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param lda. + * lda must be at least n. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_lda) +{ + char order = 'C'; + char trans = 'T'; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least m. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_lda_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param lda. + * If matrices are stored using col major layout, + * lda must be at least n. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_lda_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = CblasNoTrans; + + blasint m = 1, n = 1; + blasint lda = INVALID; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 6; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Fortran API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(zgemv, xerbla_invalid_trans) +{ + char order = 'C'; + char trans = 'Z'; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Column major + */ +CTEST(zgemv, c_api_xerbla_invalid_trans_col_major) +{ + enum CBLAS_ORDER corder = CblasColMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param trans. + * + * Row major + */ +CTEST(zgemv, c_api_xerbla_invalid_trans_row_major) +{ + enum CBLAS_ORDER corder = CblasRowMajor; + enum CBLAS_TRANSPOSE ctrans = INVALID; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 1; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * C API specific test + * Test error function for an invalid param order. + */ +CTEST(zgemv, c_api_xerbla_invalid_order_col_major) +{ + enum CBLAS_ORDER corder = INVALID; + enum CBLAS_TRANSPOSE ctrans = CblasTrans; + + blasint m = 1, n = 1; + blasint lda = 1; + + blasint inc_x = 1; + blasint inc_y = 1; + + int expected_info = 0; + + int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c new file mode 100644 index 000000000..6461ce88f --- /dev/null +++ b/utest/test_extensions/test_zimatcopy.c @@ -0,0 +1,850 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZIMATCOPY { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZIMATCOPY data_zimatcopy; + +/** + * Comapare results computed by zimatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param alpha specifies scaling factor for matrix A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * return norm of difference between openblas and reference func + */ +static double check_zimatcopy(char api, char order, char trans, blasint rows, blasint cols, double *alpha, + blasint lda_src, blasint lda_dst) +{ + blasint m, n; + blasint rows_out, cols_out; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + n = rows; m = cols; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + rows_out = n; cols_out = m*2; + if (trans == 'C') + conj = 1; + } + else { + rows_out = m; cols_out = n*2; + if (trans == 'R') + conj = 1; + } + + drand_generate(data_zimatcopy.a_test, lda_src*m*2); + + if (trans == 'T' || trans == 'C') { + ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); + } + else { + zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); + } + + if (api == 'F') { + BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, + &lda_src, &lda_dst); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test, + lda_src, lda_dst); + } + + // Find the differences between output matrix computed by zimatcopy and reference func + return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows specifies number of rows of A + * param cols specifies number of columns of A + * param lda_src - leading dimension of the matrix A + * param lda_dst - leading dimension of output matrix A + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda_src, blasint lda_dst, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + + set_xerbla("ZIMATCOPY", expected_info); + + BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, + &lda_src, &lda_dst); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = -3.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {-3.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_notrans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, colmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'C'; + double alpha[] = {1.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, colmajor_conj_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 50; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 2.0, alpha_i = 3.0 + */ +CTEST(zimatcopy, rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha[] = {2.0, 3.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Rectangular matrix + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Rectangular matrix + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zimatcopy, rowmajor_conj_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 50; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_colmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + double alpha[] = {3.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy only + * Square matrix + * alpha_r = 3.0, alpha_i = 1.5 + */ +CTEST(zimatcopy, c_api_colmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {3.0, 1.5}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition + * Square matrix + * alpha_r = 3.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_trans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + double alpha[] = {3.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_colmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy only + * Square matrix + * alpha_r = 1.0, alpha_i = 1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_notrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.0, 1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zimatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zimatcopy by comparing it against reference + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + + double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test error function for an invalid param order. + * Must be column (C) or row major (R). + */ +CTEST(zimatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param trans. + * Must be trans (T/C) or no-trans (N/R). + */ +CTEST(zimatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda_src = 100, lda_dst = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param m. + * Must be positive. + */ +CTEST(zimatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda_src = 0, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param n. + * Must be positive. + */ +CTEST(zimatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda_src = 100, lda_dst = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using row major layout, + * lda_src must be at least n. + */ +CTEST(zimatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda_src = 50, lda_dst = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_src. + * If matrices are stored using column major layout, + * lda_src must be at least m. + */ +CTEST(zimatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda_src = 50, lda_dst = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is no transposition, lda_dst must be at least n. + */ +CTEST(zimatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using row major layout and + * there is transposition, lda_dst must be at least m. + */ +CTEST(zimatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is no transposition, lda_dst must be at least m. + */ +CTEST(zimatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda_dst. + * If matrices are stored using column major layout and + * there is transposition, lda_dst must be at least n. + */ +CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda_src = 100, lda_dst = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c new file mode 100644 index 000000000..8df3dd80f --- /dev/null +++ b/utest/test_extensions/test_zomatcopy.c @@ -0,0 +1,745 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 + +struct DATA_ZOMATCOPY { + double a_test[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * DATASIZE * 2]; + double b_verify[DATASIZE * DATASIZE * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZOMATCOPY data_zomatcopy; + +/** + * Comapare results computed by zomatcopy and reference func + * + * param api specifies tested api (C or Fortran) + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param alpha - scaling factor for matrix B + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * return norm of difference between openblas and reference func + */ +static double check_zomatcopy(char api, char order, char trans, blasint rows, blasint cols, double* alpha, + blasint lda, blasint ldb) +{ + blasint b_rows, b_cols; + blasint m, n; + enum CBLAS_ORDER corder; + enum CBLAS_TRANSPOSE ctrans; + int conj = -1; + + if (order == 'C') { + m = cols; n = rows; + } + else { + m = rows; n = cols; + } + + if(trans == 'T' || trans == 'C') { + b_rows = n; b_cols = m*2; + if (trans == 'C') + conj = 1; + } + else { + b_rows = m; b_cols = n*2; + if (trans == 'R') + conj = 1; + } + + drand_generate(data_zomatcopy.a_test, lda*m*2); + + if (trans == 'T' || trans == 'C') { + ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + } + else { + zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + } + + if (api == 'F') { + BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, + &lda, data_zomatcopy.b_test, &ldb); + } + else { + if (order == 'C') corder = CblasColMajor; + if (order == 'R') corder = CblasRowMajor; + if (trans == 'T') ctrans = CblasTrans; + if (trans == 'N') ctrans = CblasNoTrans; + if (trans == 'C') ctrans = CblasConjTrans; + if (trans == 'R') ctrans = CblasConjNoTrans; + cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test, + lda, data_zomatcopy.b_test, ldb); + } + + return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param order specifies row or column major order + * param trans specifies op(A), the transposition operation + * applied to the matrix A + * param rows - number of rows of A + * param cols - number of columns of A + * param lda - leading dimension of the matrix A + * param ldb - leading dimension of the matrix B + * param expected_info - expected invalid parameter number + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char order, char trans, blasint rows, blasint cols, + blasint lda, blasint ldb, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + + set_xerbla("ZOMATCOPY", expected_info); + + BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, + &lda, data_zomatcopy.b_test, &ldb); + + return check_error(); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {-1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zomatcopy, colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy only + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'N'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition + * alpha_r = -1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_trans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + double alpha[] = {-1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Copy and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_colmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'R'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Column Major + * Transposition and conjugate + * alpha_r = 2.0, alpha_i = 1.0 + */ +CTEST(zomatcopy, c_api_colmajor_conjtrnas_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'C'; + double alpha[] = {2.0, 1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy only + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_notrans_col_50_row_100) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 50; + char order = 'R'; + char trans = 'N'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_trans_col_100_row_50) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Copy and conjugate + * alpha_r = 1.5, alpha_i = -1.0 + */ +CTEST(zomatcopy, c_api_rowmajor_conj_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'R'; + double alpha[] = {1.5, -1.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zomatcopy by comparing it against refernce + * with the following options: + * + * Row Major + * Transposition and conjugate + * alpha_r = 1.0, alpha_i = 2.0 + */ +CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'R'; + char trans = 'C'; + double alpha[] = {1.0, 2.0}; + double norm; + + norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** +* Test error function for an invalid param order. +* Must be column (C) or row major (R). +*/ +CTEST(zomatcopy, xerbla_invalid_order) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'O'; + char trans = 'T'; + int expected_info = 1; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param trans. +* Must be trans (T/C) or no-trans (N/R). +*/ +CTEST(zomatcopy, xerbla_invalid_trans) +{ + blasint m = 100, n = 100; + blasint lda = 100, ldb = 100; + char order = 'C'; + char trans = 'O'; + int expected_info = 2; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param m. +* Must be positive. +*/ +CTEST(zomatcopy, xerbla_invalid_rows) +{ + blasint m = 0, n = 100; + blasint lda = 0, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 3; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param n. +* Must be positive. +*/ +CTEST(zomatcopy, xerbla_invalid_cols) +{ + blasint m = 100, n = 0; + blasint lda = 100, ldb = 0; + char order = 'C'; + char trans = 'T'; + int expected_info = 4; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param lda. +* If matrices are stored using row major layout, +* lda must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_invalid_lda) +{ + blasint m = 50, n = 100; + blasint lda = 50, ldb = 100; + char order = 'R'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param lda. +* If matrices are stored using column major layout, +* lda must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_invalid_lda) +{ + blasint m = 100, n = 50; + blasint lda = 50, ldb = 100; + char order = 'C'; + char trans = 'T'; + int expected_info = 7; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is no transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_notrans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_rowmajor_trans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is no transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_rowmajor_conj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using row major layout and +* there is transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_rowmajor_transconj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'R'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is no transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_notrans_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'N'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_colmajor_trans_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'T'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is no transposition, ldb must be at least m. +*/ +CTEST(zomatcopy, xerbla_colmajor_conj_invalid_ldb) +{ + blasint m = 100, n = 50; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'R'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** +* Test error function for an invalid param ldb. +* If matrices are stored using column major layout and +* there is transposition, ldb must be at least n. +*/ +CTEST(zomatcopy, xerbla_colmajor_transconj_invalid_ldb) +{ + blasint m = 50, n = 100; + blasint lda = 100, ldb = 50; + char order = 'C'; + char trans = 'C'; + int expected_info = 9; + + int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c new file mode 100644 index 000000000..5471e051a --- /dev/null +++ b/utest/test_extensions/test_zrot.c @@ -0,0 +1,790 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZROT { + double x_test[DATASIZE * INCREMENT * 2]; + double y_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; + double y_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZROT data_zrot; + +/** + * Comapare results computed by zdrot and zaxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s) +{ + blasint i; + double norm = 0; + double s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + drand_generate(data_zrot.x_test, n * inc_x_abs * 2); + drand_generate(data_zrot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + drand_generate(data_zrot.x_test, n * 2); + drand_generate(data_zrot.y_test, n * 2); + } + + // Copy vector x for zaxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_zrot.x_verify[i] = data_zrot.x_test[i]; + + // Copy vector y for zaxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_zrot.y_verify[i] = data_zrot.y_test[i]; + + // Find cx = c*x + s*y + BLASFUNC(zaxpby)(&n, s, data_zrot.y_test, &inc_y, c, data_zrot.x_verify, &inc_x); + + // Find cy = -conjg(s)*x + c*y + BLASFUNC(zaxpby)(&n, s_neg, data_zrot.x_test, &inc_x, c, data_zrot.y_verify, &inc_y); + + BLASFUNC(zdrot)(&n, data_zrot.x_test, &inc_x, data_zrot.y_test, &inc_y, c, s); + + // Find the differences between vector x caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_zrot.x_test[i] -= data_zrot.x_verify[i]; + + // Find the differences between vector y caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_zrot.y_test[i] -= data_zrot.y_verify[i]; + + // Find the norm of differences + norm += BLASFUNC(dznrm2)(&n, data_zrot.x_test, &inc_x_abs); + norm += BLASFUNC(dznrm2)(&n, data_zrot.y_test, &inc_y_abs); + return (norm / 2); +} + +/** + * C API specific function + * Comapare results computed by zdrot and zaxpby + * + * param n specifies size of vector x + * param inc_x specifies increment of vector x + * param inc_y specifies increment of vector y + * param c specifies cosine + * param s specifies sine + * return norm of differences + */ +static double c_api_check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s) +{ + blasint i; + double norm = 0; + double s_neg[] = {-s[0], s[1]}; + + blasint inc_x_abs = labs(inc_x); + blasint inc_y_abs = labs(inc_y); + + // Fill vectors x, y + drand_generate(data_zrot.x_test, n * inc_x_abs * 2); + drand_generate(data_zrot.y_test, n * inc_y_abs * 2); + + if (inc_x == 0 && inc_y == 0) { + drand_generate(data_zrot.x_test, n * 2); + drand_generate(data_zrot.y_test, n * 2); + } + + // Copy vector x for zaxpby + for (i = 0; i < n * inc_x_abs * 2; i++) + data_zrot.x_verify[i] = data_zrot.x_test[i]; + + // Copy vector y for zaxpby + for (i = 0; i < n * inc_y_abs * 2; i++) + data_zrot.y_verify[i] = data_zrot.y_test[i]; + + // Find cx = c*x + s*y + cblas_zaxpby(n, s, data_zrot.y_test, inc_y, c, data_zrot.x_verify, inc_x); + + // Find cy = -conjg(s)*x + c*y + cblas_zaxpby(n, s_neg, data_zrot.x_test, inc_x, c, data_zrot.y_verify, inc_y); + + cblas_zdrot(n, data_zrot.x_test, inc_x, data_zrot.y_test, inc_y, c[0], s[0]); + + // Find the differences between vector x caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_x_abs; i++) + data_zrot.x_test[i] -= data_zrot.x_verify[i]; + + // Find the differences between vector y caculated by zaxpby and zdrot + for (i = 0; i < n * 2 * inc_y_abs; i++) + data_zrot.y_test[i] -= data_zrot.y_verify[i]; + + // Find the norm of differences + norm += cblas_dznrm2(n, data_zrot.x_test, inc_x_abs); + norm += cblas_dznrm2(n, data_zrot.y_test, inc_y_abs); + return (norm / 2); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {2.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0 + * s = 1.0 + */ +CTEST(zrot, inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {0.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 0.0 + */ +CTEST(zrot, inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {0.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 0 + * Stride of vector y is 0 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_0_inc_y_0) +{ + blasint n = 100; + + blasint inc_x = 0; + blasint inc_y = 0; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -1 + * Stride of vector y is -1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_1_inc_y_neg_1) +{ + blasint n = 100; + + blasint inc_x = -1; + blasint inc_y = -1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 1 + * c = 3.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {3.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is -2 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_2_inc_y_1) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 1 + * Stride of vector y is -2 + * c = 2.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_1_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = 1; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {2.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 2.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {2.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_neg_2_inc_y_neg_2) +{ + blasint n = 100; + + blasint inc_x = -2; + blasint inc_y = -2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 0.0 + * s = 1.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2_c_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {0.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 100 + * Stride of vector x is 2 + * Stride of vector y is 2 + * c = 1.0 + * s = 0.0 + */ +CTEST(zrot, c_api_inc_x_2_inc_y_2_s_zero) +{ + blasint n = 100; + + blasint inc_x = 2; + blasint inc_y = 2; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {0.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrot by comparing it with zaxpby. + * Test with the following options: + * + * Size of vectors x, y is 0 + * Stride of vector x is 1 + * Stride of vector y is 1 + * c = 1.0 + * s = 1.0 + */ +CTEST(zrot, c_api_check_n_zero) +{ + blasint n = 0; + + blasint inc_x = 1; + blasint inc_y = 1; + + // Imaginary part for zaxpby + double c[] = {1.0, 0.0}; + double s[] = {1.0, 0.0}; + + double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c new file mode 100644 index 000000000..310121422 --- /dev/null +++ b/utest/test_extensions/test_zrotg.c @@ -0,0 +1,290 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include + +#ifdef BUILD_COMPLEX16 + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_a) +{ + double sa[2] = {0.0, 0.0}; + double sb[2] = {1.0, 1.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific tests + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_b) +{ + double sa[2] = {1.0, 1.0}; + double sb[2] = {0.0, 0.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, zero_real) +{ + double sa[2] = {0.0, 1.0}; + double sb[2] = {0.0, 1.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, positive_real_positive_img) +{ + double sa[2] = {3.0, 4.0}; + double sb[2] = {4.0, 6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, negative_real_positive_img) +{ + double sa[2] = {-3.0, 4.0}; + double sb[2] = {-4.0, 6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, positive_real_negative_img) +{ + double sa[2] = {3.0, -4.0}; + double sb[2] = {4.0, -6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, negative_real_negative_img) +{ + double sa[2] = {-3.0, -4.0}; + double sb[2] = {-4.0, -6.0}; + double ss[2]; + double sc; + BLASFUNC(zrotg)(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_a) +{ + double sa[2] = {0.0, 0.0}; + double sb[2] = {1.0, 1.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_b) +{ + double sa[2] = {1.0, 1.0}; + double sb[2] = {0.0, 0.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_zero_real) +{ + double sa[2] = {0.0, 1.0}; + double sb[2] = {0.0, 1.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_positive_real_positive_img) +{ + double sa[2] = {3.0, 4.0}; + double sb[2] = {4.0, 6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_negative_real_positive_img) +{ + double sa[2] = {-3.0, 4.0}; + double sb[2] = {-4.0, 6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_positive_real_negative_img) +{ + double sa[2] = {3.0, -4.0}; + double sb[2] = {4.0, -6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} + +/** + * C API specific test + * Test zrotg by comparing it against pre-calculated values + */ +CTEST(zrotg, c_api_negative_real_negative_img) +{ + double sa[2] = {-3.0, -4.0}; + double sb[2] = {-4.0, -6.0}; + double ss[2]; + double sc; + cblas_zrotg(sa, sb, &sc, ss); + ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c new file mode 100644 index 000000000..afdb208c1 --- /dev/null +++ b/utest/test_extensions/test_zsbmv.c @@ -0,0 +1,606 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSBMV { + double sp_matrix[DATASIZE * (DATASIZE + 1)]; + double sb_matrix[DATASIZE * DATASIZE * 2]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON +// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11 +#define DOUBLE_EPS_ZGEMV 1e-11 + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSBMV data_zsbmv; + +/** + * Transform full-storage symmetric band matrix A to upper (U) or lower (L) + * band-packed storage mode. + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * output param a - buffer for holding symmetric band-packed matrix + * param lda - specifies the leading dimension of a + * param sb_matrix - buffer holding full-storage symmetric band matrix A + * param ldm - specifies the leading dimension of A + */ +static void transform_to_band_storage(char uplo, blasint n, blasint k, double* a, blasint lda, + double* sb_matrix, blasint ldm) +{ + blasint i, j, m; + if (uplo == 'L') { + for (j = 0; j < n; j++) + { + m = -j; + for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } + else { + for (j = 0; j < n; j++) + { + m = k - j; + for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2) + { + a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2]; + a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2]; + } + } + } +} + +/** + * Generate full-storage symmetric band matrix A with k - super-diagonals + * from input symmetric packed matrix in lower packed mode (L) + * + * output param sb_matrix - buffer for holding full-storage symmetric band matrix. + * param sp_matrix - buffer holding input symmetric packed matrix + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + */ +static void get_symmetric_band_matr(double *sb_matrix, double *sp_matrix, blasint n, blasint k) +{ + blasint m; + blasint i, j; + m = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + // Make matrix band with k super-diagonals + if (fabs((i+1) - ceil((j+1)/2.0)) > k) + { + sb_matrix[i * n * 2 + j] = 0.0; + sb_matrix[i * n * 2 + j + 1] = 0.0; + continue; + } + + if (j / 2 < i) + { + sb_matrix[i * n * 2 + j] = + sb_matrix[j * n + i * 2]; + sb_matrix[i * n * 2 + j + 1] = + sb_matrix[j * n + i * 2 + 1]; + } + else + { + sb_matrix[i * n * 2 + j] = sp_matrix[m++]; + sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++]; + } + } + } +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix a is upper or lower band-packed. + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param inc_c - stride of vector c_test + * param expected_info - expected invalid parameter number in zsbmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b, + blasint inc_c, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double a[2]; + drand_generate(a, 2); + + set_xerbla("ZSBMV ", expected_info); + + BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, + &inc_b, beta, data_zsbmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Comapare results computed by zsbmv and zgemv + * since zsbmv is zgemv for symmetric band matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param k - number of super-diagonals of A + * param alpha - scaling factor for the matrix-vector product + * param lda - specifies the leading dimension of a + * param inc_b - stride of vector b_test + * param beta - scaling factor for vector c_test + * param inc_c - stride of vector c_test + * param lda - specifies the leading dimension of a + * return norm of differences + */ +static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasint lda, + blasint inc_b, double *beta, blasint inc_c, blasint ldm) +{ + blasint i; + + // Trans param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Symmetric band packed matrix for sbmv + double a[lda * n * 2]; + + // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test + drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); + drand_generate(data_zsbmv.b_test, n * inc_b * 2); + drand_generate(data_zsbmv.c_test, n * inc_c * 2); + + // Copy vector c_test for zgemv + for (i = 0; i < n * inc_c * 2; i++) + data_zsbmv.c_verify[i] = data_zsbmv.c_test[i]; + + // Generate full-storage symmetric band matrix + // with k super-diagonals from symmetric packed matrix + get_symmetric_band_matr(data_zsbmv.sb_matrix, data_zsbmv.sp_matrix, n, k); + + // Transform symmetric band matrix from conventional + // full matrix storage to band storage for zsbmv + transform_to_band_storage(uplo, n, k, a, lda, data_zsbmv.sb_matrix, ldm); + + BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zsbmv.sb_matrix, &ldm, data_zsbmv.b_test, + &inc_b, beta, data_zsbmv.c_verify, &inc_c); + + BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, + data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); + + // Find the differences between output vector caculated by zsbmv and zgemv + for (i = 0; i < n * inc_c * 2; i++) + data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zsbmv.c_test, &inc_c); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(zsbmv, upper_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(zsbmv, upper_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is upper-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 0 + */ +CTEST(zsbmv, lower_k_0_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 1 + */ +CTEST(zsbmv, lower_k_1_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 1; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 1 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_1_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 1 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_1_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 1; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test zsbmv by comparing it against zgemv + * with the following options: + * + * a is lower-band-packed symmetric matrix + * Number of rows and columns of A is 100 + * Stride of vector b_test is 2 + * Stride of vector c_test is 2 + * Number of super-diagonals k is 2 + */ +CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_2_n_100) +{ + blasint n = DATASIZE, inc_b = 2, inc_c = 2; + blasint k = 2; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'L'; + + double alpha[] = {2.0, 1.0}; + double beta[] = {2.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Check if output matrix a contains any NaNs + */ +CTEST(zsbmv, check_for_NaN) +{ + blasint n = DATASIZE, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = n; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {1.0, 1.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode. + */ +CTEST(zsbmv, xerbla_uplo_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'O'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 1; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(zsbmv, xerbla_n_invalid) +{ + blasint n = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 2; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Check if n - number of rows and columns of A equal zero. + */ +CTEST(zsbmv, check_n_zero) +{ + blasint n = 0, inc_b = 1, inc_c = 1; + blasint k = 0; + blasint lda = k + 1; + blasint ldm = 1; + char uplo = 'U'; + + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm); + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b_test. Can't be zero. + */ +CTEST(zsbmv, xerbla_inc_b_zero) +{ + blasint n = 1, inc_b = 0, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 8; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c_test. Can't be zero. + */ +CTEST(zsbmv, xerbla_inc_c_zero) +{ + blasint n = 1, inc_b = 1, inc_c = 0; + char uplo = 'U'; + blasint k = 0; + blasint lda = k + 1; + int expected_info = 11; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param k - + * number of super-diagonals of A. Must be at least zero. + */ +CTEST(zsbmv, xerbla_k_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = INVALID; + blasint lda = 1; + int expected_info = 3; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param lda - + * specifies the leading dimension of a. Must be at least (k+1). + */ +CTEST(zsbmv, xerbla_lda_invalid) +{ + blasint n = 1, inc_b = 1, inc_c = 1; + char uplo = 'U'; + blasint k = 0; + blasint lda = INVALID; + int expected_info = 6; + + int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c new file mode 100644 index 000000000..132f4ee5b --- /dev/null +++ b/utest/test_extensions/test_zscal.c @@ -0,0 +1,165 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSCAL { + double x_test[DATASIZE * 2 * INCREMENT]; + double x_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSCAL data_zscal; + + +/** + * zscal reference code + * + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param x - buffer holding input vector x + * param inc - stride of vector x + */ +static void zscal_trusted(blasint n, double *alpha, double* x, blasint inc){ + blasint i, ip = 0; + blasint inc_x2 = 2 * inc; + double temp; + for (i = 0; i < n; i++) + { + temp = alpha[0] * x[ip] - alpha[1] * x[ip+1]; + x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip]; + x[ip] = temp; + ip += inc_x2; + } +} + +/** + * Comapare results computed by zscal and zscal_trusted + * + * param api specifies tested api (C or Fortran) + * param n - number of elements of vector x + * param alpha - scaling factor for the vector product + * param inc - stride of vector x + * return norm of differences + */ +static double check_zscal(char api, blasint n, double *alpha, blasint inc) +{ + blasint i; + + // Fill vectors x + drand_generate(data_zscal.x_test, n * inc * 2); + + // Copy vector x for zscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_zscal.x_verify[i] = data_zscal.x_test[i]; + + zscal_trusted(n, alpha, data_zscal.x_verify, inc); + + if(api == 'F') + BLASFUNC(zscal)(&n, alpha, data_zscal.x_test, &inc); + else + cblas_zscal(n, alpha, data_zscal.x_test, inc); + + // Find the differences between output vector computed by zscal and zscal_trusted + for (i = 0; i < n * 2 * inc; i++) + data_zscal.x_verify[i] -= data_zscal.x_test[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zscal.x_verify, &inc); +} + +/** + * Fortran API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + double alpha[2] = {0.0, 1.0}; + + double norm = check_zscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Fortran API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + double alpha[2] = {0.0, 0.0}; + + double norm = check_zscal('F', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, c_api_alpha_r_zero_alpha_i_not_zero) +{ + blasint N = DATASIZE; + blasint inc = 1; + double alpha[2] = {0.0, 1.0}; + + double norm = check_zscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * C API specific test + * Test zscal by comparing it against reference + */ +CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2) +{ + blasint N = DATASIZE; + blasint inc = 2; + double alpha[2] = {0.0, 0.0}; + + double norm = check_zscal('C', N, alpha, inc); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_zspmv.c b/utest/test_extensions/test_zspmv.c new file mode 100644 index 000000000..510ac0579 --- /dev/null +++ b/utest/test_extensions/test_zspmv.c @@ -0,0 +1,427 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include "common.h" + +#define DATASIZE 100 +#define INCREMENT 2 + +struct DATA_ZSPMV { + double a_verify[DATASIZE * DATASIZE * 2]; + double a_test[DATASIZE * (DATASIZE + 1)]; + double b_test[DATASIZE * 2 * INCREMENT]; + double c_test[DATASIZE * 2 * INCREMENT]; + double c_verify[DATASIZE * 2 * INCREMENT]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZSPMV data_zspmv; + +/** + * Compute spmv via gemv since spmv is gemv for symmetric packed matrix + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param a - buffer holding input matrix A + * param b - Buffer holding input vector b + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param c - buffer holding input/output vector c + * param inc_c - stride of vector c + * output param data_zspmv.c_verify - matrix computed by gemv + */ +static void zspmv_trusted(char uplo, blasint n, double *alpha, double *a, + double *b, blasint inc_b, double *beta, double *c, + blasint inc_c) +{ + blasint k; + blasint i, j; + + // param for gemv (can use any, since the input matrix is symmetric) + char trans = 'N'; + + // Unpack the input symmetric packed matrix + if (uplo == 'L') + { + k = 0; + for (i = 0; i < n; i++) + { + for (j = 0; j < n * 2; j += 2) + { + if (j / 2 < i) + { + data_zspmv.a_verify[i * n * 2 + j] = + data_zspmv.a_verify[j * n + i * 2]; + data_zspmv.a_verify[i * n * 2 + j + 1] = + data_zspmv.a_verify[j * n + i * 2 + 1]; + } + else + { + data_zspmv.a_verify[i * n * 2 + j] = a[k++]; + data_zspmv.a_verify[i * n * 2 + j + 1] = a[k++]; + } + } + } + } + else + { + k = n * (n + 1) - 1; + for (j = 2 * n - 1; j >= 0; j -= 2) + { + for (i = n - 1; i >= 0; i--) + { + if (j / 2 < i) + { + data_zspmv.a_verify[i * n * 2 + j] = + data_zspmv.a_verify[(j - 1) * n + i * 2 + 1]; + data_zspmv.a_verify[i * n * 2 + j - 1] = + data_zspmv.a_verify[(j - 1) * n + i * 2]; + } + else + { + data_zspmv.a_verify[i * n * 2 + j] = a[k--]; + data_zspmv.a_verify[i * n * 2 + j - 1] = a[k--]; + } + } + } + } + + // Run gemv with unpacked matrix + BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zspmv.a_verify, &n, b, + &inc_b, beta, c, &inc_c); +} + +/** + * Comapare results computed by zspmv and zspmv_trusted + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param alpha - scaling factor for the matrix-vector product + * param inc_b - stride of vector b + * param beta - scaling factor for vector c + * param inc_c - stride of vector c + * return norm of differences + */ +static double check_zspmv(char uplo, blasint n, double *alpha, blasint inc_b, + double *beta, blasint inc_c) +{ + blasint i; + + // Fill symmetric packed maxtix a, vectors b and c + drand_generate(data_zspmv.a_test, n * (n + 1)); + drand_generate(data_zspmv.b_test, 2 * n * inc_b); + drand_generate(data_zspmv.c_test, 2 * n * inc_c); + + // Copy vector c for zspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_zspmv.c_verify[i] = data_zspmv.c_test[i]; + + zspmv_trusted(uplo, n, alpha, data_zspmv.a_test, data_zspmv.b_test, + inc_b, beta, data_zspmv.c_verify, inc_c); + BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, + &inc_b, beta, data_zspmv.c_test, &inc_c); + + // Find the differences between output vector caculated by zspmv and zspmv_trusted + for (i = 0; i < n * 2 * inc_c; i++) + data_zspmv.c_test[i] -= data_zspmv.c_verify[i]; + + // Find the norm of differences + return BLASFUNC(dznrm2)(&n, data_zspmv.c_test, &inc_c); +} + +/** + * Check if error function was called with expected function name + * and param info + * + * param uplo specifies whether matrix A is upper or lower triangular + * param n - number of rows and columns of A + * param inc_b - stride of vector b + * param inc_c - stride of vector c + * param expected_info - expected invalid parameter number in zspmv + * return TRUE if everything is ok, otherwise FALSE + */ +static int check_badargs(char uplo, blasint n, blasint inc_b, + blasint inc_c, int expected_info) +{ + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + set_xerbla("ZSPMV ", expected_info); + + BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, + &inc_b, beta, data_zspmv.c_test, &inc_c); + + return check_error(); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zspmv, upper_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(zspmv, upper_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(zspmv, upper_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 0.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is upper triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zspmv, upper_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'U'; + double alpha[] = {2.5, -2.1}; + double beta[] = {0.0, 1.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 1 + */ +CTEST(zspmv, lower_inc_b_1_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 1 + * Stride of vector c is 2 + */ +CTEST(zspmv, lower_inc_b_1_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 2; + char uplo = 'L'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 1 + */ +CTEST(zspmv, lower_inc_b_2_inc_c_1_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 1; + char uplo = 'L'; + double alpha[] = {1.0, 0.0}; + double beta[] = {1.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Test zspmv by comparing it against zgemv + * with the following options: + * + * A is lower triangular + * Number of rows and columns of A is 100 + * Stride of vector b is 2 + * Stride of vector c is 2 + */ +CTEST(zspmv, lower_inc_b_2_inc_c_2_N_100) +{ + blasint N = DATASIZE, inc_b = 2, inc_c = 2; + char uplo = 'L'; + double alpha[] = {2.5, -2.1}; + double beta[] = {0.0, 1.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL); +} + +/** + * Check if output matrix A contains any NaNs + */ +CTEST(zspmv, check_for_NaN) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'U'; + double alpha[] = {1.0, 1.0}; + double beta[] = {0.0, 0.0}; + + double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c); + + ASSERT_TRUE(norm == norm); /* NaN == NaN is false */ +} + +/** + * Test error function for an invalid param uplo. + * uplo specifies whether A is upper or lower triangular. + */ +CTEST(zspmv, xerbla_uplo_invalid) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 1; + char uplo = 'O'; + int expected_info = 1; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param N - + * number of rows and columns of A. Must be at least zero. + */ +CTEST(zspmv, xerbla_N_invalid) +{ + blasint N = INVALID, inc_b = 1, inc_c = 1; + char uplo = 'U'; + int expected_info = 2; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_b - + * stride of vector b. Can't be zero. + */ +CTEST(zspmv, xerbla_inc_b_zero) +{ + blasint N = DATASIZE, inc_b = 0, inc_c = 1; + char uplo = 'U'; + int expected_info = 6; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} + +/** + * Test error function for an invalid param inc_c - + * stride of vector c. Can't be zero. + */ +CTEST(zspmv, xerbla_inc_c_zero) +{ + blasint N = DATASIZE, inc_b = 1, inc_c = 0; + char uplo = 'U'; + int expected_info = 9; + + int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info); + ASSERT_EQUAL(TRUE, passed); +} +#endif diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c new file mode 100644 index 000000000..aad64099e --- /dev/null +++ b/utest/test_extensions/test_ztrmv.c @@ -0,0 +1,266 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_ZTRMV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZTRMV data_ztrmv; + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char trans_verify = trans; + + srand_generate(data_ztrmv.a_test, n * lda * 2); + srand_generate(data_ztrmv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ztrmv.a_verify[i] = data_ztrmv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ztrmv.x_verify[i] = data_ztrmv.x_test[i]; + + if (trans == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ztrmv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ztrmv)(&uplo, &trans_verify, &diag, &n, data_ztrmv.a_verify, &lda, + data_ztrmv.x_verify, &incx); + + BLASFUNC(ztrmv)(&uplo, &trans, &diag, &n, data_ztrmv.a_test, &lda, + data_ztrmv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ztrmv.x_verify[i] -= data_ztrmv.x_test[i]; + + return BLASFUNC(dznrm2)(&n, data_ztrmv.x_verify, &incx); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ztrmv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ztrmv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ztrmv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ztrmv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrmv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrmv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c new file mode 100644 index 000000000..ae556f5e2 --- /dev/null +++ b/utest/test_extensions/test_ztrsv.c @@ -0,0 +1,267 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "utest/openblas_utest.h" +#include +#include "common.h" + +#define DATASIZE 300 +#define INCREMENT 2 + +struct DATA_ZTRSV { + double a_test[DATASIZE * DATASIZE * 2]; + double a_verify[DATASIZE * DATASIZE * 2]; + double x_test[DATASIZE * INCREMENT * 2]; + double x_verify[DATASIZE * INCREMENT * 2]; +}; + +#ifdef BUILD_COMPLEX16 +static struct DATA_ZTRSV data_ztrsv; + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * + * param uplo specifies whether A is upper or lower triangular + * param trans specifies op(A), the transposition (conjugation) operation applied to A + * param diag specifies whether the matrix A is unit triangular or not. + * param n - numbers of rows and columns of A + * param lda - leading dimension of matrix A + * param incx - increment for the elements of x + * return norm of difference + */ +static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx) +{ + blasint i; + double alpha_conj[] = {1.0, 0.0}; + char trans_verify = trans; + + srand_generate(data_ztrsv.a_test, n * lda * 2); + srand_generate(data_ztrsv.x_test, n * incx * 2); + + for (i = 0; i < n * lda * 2; i++) + data_ztrsv.a_verify[i] = data_ztrsv.a_test[i]; + + for (i = 0; i < n * incx * 2; i++) + data_ztrsv.x_verify[i] = data_ztrsv.x_test[i]; + + if (trans == 'R'){ + cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, + alpha_conj, data_ztrsv.a_verify, lda, lda); + trans_verify = 'N'; + } + + BLASFUNC(ztrsv)(&uplo, &trans_verify, &diag, &n, data_ztrsv.a_verify, + &lda, data_ztrsv.x_verify, &incx); + + BLASFUNC(ztrsv)(&uplo, &trans, &diag, &n, data_ztrsv.a_test, &lda, + data_ztrsv.x_test, &incx); + + for (i = 0; i < n * incx * 2; i++) + data_ztrsv.x_verify[i] -= data_ztrsv.x_test[i]; + + return BLASFUNC(dznrm2)(&n, data_ztrsv.x_verify, &incx); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + */ +CTEST(ztrsv, conj_notrans_upper_not_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + */ +CTEST(ztrsv, conj_notrans_upper_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + */ +CTEST(ztrsv, conj_notrans_lower_not_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + */ +CTEST(ztrsv, conj_notrans_lower_unit_triangular) +{ + blasint n = DATASIZE, incx = 1, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_upper_not_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is upper triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_upper_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'U'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is not unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_lower_not_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'N'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} + +/** + * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A + * and comparing it with the non-conjugate ztrsv. + * Test with the following options: + * + * matrix A is conjugate and not-trans + * matrix A is lower triangular + * matrix A is unit triangular + * vector x stride is 2 + */ +CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2) +{ + blasint n = DATASIZE, incx = 2, lda = DATASIZE; + char uplo = 'L'; + char diag = 'U'; + char trans = 'R'; + + double norm = check_ztrsv(uplo, trans, diag, n, lda, incx); + + ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); +} +#endif \ No newline at end of file diff --git a/utest/test_extensions/xerbla.c b/utest/test_extensions/xerbla.c new file mode 100644 index 000000000..9487b20a6 --- /dev/null +++ b/utest/test_extensions/xerbla.c @@ -0,0 +1,88 @@ +/***************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +static int link_xerbla=TRUE; +static int lerr, _info, ok; +static char *rout; + +static void F77_xerbla(char *srname, void *vinfo) +{ + int info=*(int*)vinfo; + + if (link_xerbla) + { + link_xerbla = 0; + return; + } + + if (rout != NULL && strcmp(rout, srname) != 0){ + printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", srname, rout); + ok = FALSE; + } + + if (info != _info){ + printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, _info, srname); + lerr = TRUE; + ok = FALSE; + } else lerr = FALSE; +} + +/** +* error function redefinition +*/ +int BLASFUNC(xerbla)(char *name, blasint *info, blasint length) +{ + F77_xerbla(name, info); + return 0; +} + +int check_error(void) { + if (lerr == TRUE ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", _info, rout); + ok = FALSE; + } + lerr = TRUE; + return ok; +} + +void set_xerbla(char* current_rout, int expected_info){ + if (link_xerbla) /* call these first to link */ + F77_xerbla(rout, &_info); + + ok = TRUE; + lerr = TRUE; + _info = expected_info; + rout = current_rout; +} \ No newline at end of file From c99e231fc5bb9ae661547089b561f0a66e99431e Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Thu, 18 Jan 2024 23:54:51 +0300 Subject: [PATCH 597/718] Fix rand_generate --- utest/test_extensions/common.c | 4 ++-- utest/test_extensions/test_zgeadd.c | 4 ++-- utest/test_extensions/test_ztrmv.c | 4 ++-- utest/test_extensions/test_ztrsv.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c index c3bdcefc7..8a6a47795 100644 --- a/utest/test_extensions/common.c +++ b/utest/test_extensions/common.c @@ -40,14 +40,14 @@ void srand_generate(float *alpha, blasint n) { blasint i; for (i = 0; i < n; i++) - alpha[i] = (float)rand() / (float)RAND_MAX * 5.0f; + alpha[i] = (float)rand() / (float)RAND_MAX; } void drand_generate(double *alpha, blasint n) { blasint i; for (i = 0; i < n; i++) - alpha[i] = (double)rand() / (double)RAND_MAX * 5.0; + alpha[i] = (double)rand() / (double)RAND_MAX; } /** diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c index 917c04829..e50f86de0 100644 --- a/utest/test_extensions/test_zgeadd.c +++ b/utest/test_extensions/test_zgeadd.c @@ -103,8 +103,8 @@ static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order, } // Fill matrix A, C - srand_generate(data_zgeadd.a_test, lda * rows * 2); - srand_generate(data_zgeadd.c_test, ldc * rows * 2); + drand_generate(data_zgeadd.a_test, lda * rows * 2); + drand_generate(data_zgeadd.c_test, ldc * rows * 2); // Copy matrix C for zgeadd for (i = 0; i < ldc * rows * 2; i++) diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c index aad64099e..5668ec296 100644 --- a/utest/test_extensions/test_ztrmv.c +++ b/utest/test_extensions/test_ztrmv.c @@ -66,8 +66,8 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; - srand_generate(data_ztrmv.a_test, n * lda * 2); - srand_generate(data_ztrmv.x_test, n * incx * 2); + drand_generate(data_ztrmv.a_test, n * lda * 2); + drand_generate(data_ztrmv.x_test, n * incx * 2); for (i = 0; i < n * lda * 2; i++) data_ztrmv.a_verify[i] = data_ztrmv.a_test[i]; diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c index ae556f5e2..4b7ec6aaf 100644 --- a/utest/test_extensions/test_ztrsv.c +++ b/utest/test_extensions/test_ztrsv.c @@ -66,8 +66,8 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l double alpha_conj[] = {1.0, 0.0}; char trans_verify = trans; - srand_generate(data_ztrsv.a_test, n * lda * 2); - srand_generate(data_ztrsv.x_test, n * incx * 2); + drand_generate(data_ztrsv.a_test, n * lda * 2); + drand_generate(data_ztrsv.x_test, n * incx * 2); for (i = 0; i < n * lda * 2; i++) data_ztrsv.a_verify[i] = data_ztrsv.a_test[i]; From ccbc3f875bc87e92c1ab05f3b361f64d7fd95c87 Mon Sep 17 00:00:00 2001 From: Octavian Maghiar Date: Fri, 19 Jan 2024 12:40:00 +0000 Subject: [PATCH 598/718] [RISC-V] Add RISCV64_ZVL128B target to common_riscv64.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index f11e8b75d..4b5f7dcc4 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) # include #endif From aaf65210ccba0c53408c242a2e0f5ad5d798d532 Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 19 Jan 2024 19:04:21 +0000 Subject: [PATCH 599/718] Add dynamic support for Arm(R) Neoverse(TM) V2 processor Whilst I figure out how best to map the L2 parameters without duplicating all of `ARMV8SVE`, lets just map this to `NEOVERSEV1`. --- driver/others/dynamic_arm64.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 803e0b5eb..6b21028d1 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,6 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2023 The OpenBLAS Project */ +/* Copyright 2023-2024 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE; #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 16 +#define NUM_CORETYPES 17 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -178,6 +179,7 @@ static char *corename[] = { "emag8180", "neoversen1", "neoversev1", + "neoversev2", "neoversen2", "thunderx3t110", "cortexa55", @@ -198,10 +200,11 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; - if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; - if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; - if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; - if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; + if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12]; + if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; return corename[NUM_CORETYPES]; } @@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_NEOVERSEV1); - case 12: return (&gotoblas_NEOVERSEN2); - case 13: return (&gotoblas_THUNDERX3T110); - case 14: return (&gotoblas_CORTEXA55); - case 15: return (&gotoblas_ARMV8SVE); + case 12: return (&gotoblas_NEOVERSEV2); + case 13: return (&gotoblas_NEOVERSEN2); + case 14: return (&gotoblas_THUNDERX3T110); + case 15: return (&gotoblas_CORTEXA55); + case 16: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; }else return &gotoblas_NEOVERSEV1; + case 0xd4f: + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else { + return &gotoblas_NEOVERSEV2; + } #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; From 304a9b60afbc79be77193cc3bdb5bb5d503aa533 Mon Sep 17 00:00:00 2001 From: Han Gao/Revy/Rabenda Date: Sun, 21 Jan 2024 14:32:52 +0000 Subject: [PATCH 600/718] Update T-Head toolchains v2.8.0 Signed-off-by: Han Gao/Revy/Rabenda --- .github/workflows/c910v.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 30cf32b34..68ba2ddd7 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -14,8 +14,8 @@ jobs: if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest env: - xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 - toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 + toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz strategy: fail-fast: false matrix: @@ -76,7 +76,7 @@ jobs: run: | wget ${xuetie_toolchain}/${toolchain_file_name} tar -xvf ${toolchain_file_name} -C /opt - export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" + export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) From e1afb23811256b231c259ca57d7a5f6e81ac6da5 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 7 Apr 2023 11:13:23 +0300 Subject: [PATCH 601/718] Fix BLAS and LAPACK tests for C910V and RISCV64_ZVL256B targets * Fixed bugs in dgemm, [a]min\max, asum kernels * Added zero checks for BLAS kernels * Added dsdot implementation for RVV 0.7.1 * Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets * Added additional definitions for RISCV64_ZVL256B target --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + TargetList.txt | 1 + getarch.c | 14 +++ kernel/riscv64/KERNEL.C910V | 1 + kernel/riscv64/amin_vector.c | 6 +- kernel/riscv64/asum_vector.c | 7 +- kernel/riscv64/axpby_vector.c | 2 +- kernel/riscv64/dgemm_kernel_8x4_c910v.c | 2 +- kernel/riscv64/dsdot_vector.c | 152 ++++++++++++++++++++++++ kernel/riscv64/iamin_vector.c | 4 +- kernel/riscv64/izamin_vector.c | 2 +- kernel/riscv64/nrm2_vector.c | 2 +- kernel/riscv64/nrm2_vector_dot.c | 2 +- kernel/riscv64/swap_vector.c | 2 +- kernel/riscv64/zamax_vector.c | 17 +-- kernel/riscv64/zamin_vector.c | 17 +-- kernel/riscv64/znrm2_vector.c | 2 +- kernel/riscv64/zswap_vector.c | 2 +- 19 files changed, 205 insertions(+), 38 deletions(-) create mode 100644 kernel/riscv64/dsdot_vector.c diff --git a/Makefile.prebuild b/Makefile.prebuild index c4f4a2602..d30275f06 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -59,6 +59,10 @@ ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif +ifeq ($(TARGET), RISCV64_ZVL256B) +TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +endif + ifeq ($(TARGET), RISCV64_GENERIC) TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d endif diff --git a/Makefile.riscv64 b/Makefile.riscv64 index ce7a27141..2239a3676 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -6,6 +6,10 @@ ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif +ifeq ($(CORE), RISCV64_ZVL256B) +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +endif ifeq ($(CORE), RISCV64_GENERIC) CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static diff --git a/TargetList.txt b/TargetList.txt index f76f605cc..f65a18b50 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -121,6 +121,7 @@ Z14 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54) C910V x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC diff --git a/getarch.c b/getarch.c index 772836347..12ea72052 100644 --- a/getarch.c +++ b/getarch.c @@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#else +#endif + #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 0da66fa35..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 1c541f0fd..c4578eabf 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define LMUL m2 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #else # define LMUL m8 # if defined(DOUBLE) # define ELEN 64 +# define ABS fabs # else # define ELEN 32 +# define ABS fabsf # endif #endif @@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; if (n <= 0 || inc_x <= 0) return(minf); - minf = *x; + minf = ABS(*x); x += inc_x; --n; if (n == 0) return(minf); diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index 995dbf9a1..a652eafdd 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; @@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); - BLASLONG inc_xv = inc_x * gvl; for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index a58872960..0e591e697 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); @@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index a3877a46c..c76a38099 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); FLOAT cur_minf = EXTRACT_FLOAT(v_res); - if(cur_minf > minf){ + if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c index 141dffebf..5c03fbec7 100644 --- a/kernel/riscv64/nrm2_vector.c +++ b/kernel/riscv64/nrm2_vector.c @@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; - if(n <= 0) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); if(n == 1) return (ABS(x[0])); unsigned int gvl = 0; diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c index 06e61d695..dfa13a6f5 100644 --- a/kernel/riscv64/nrm2_vector_dot.c +++ b/kernel/riscv64/nrm2_vector_dot.c @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i=0, j=0; double len = 0.0 ; - if ( n < 0 ) return(0.0); + if ( n <= 0 ) return(0.0); if(n == 1) return (ABS(x[0])); FLOAT_V_T vr, v0, v1; diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index 3b467a586..f583f5392 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - if (n < 0) return(0); + if (n <= 0) return(0); unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index 2dee5ab29..ec4a5a1e9 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #ifdef RISCV_0p10_INTRINSICS #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) -#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) (va, vb, vb, c, gvl) #else #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) -#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f, ELEN, LMUL, _m) #endif #define MASK_T JOIN(vbool, MLEN, _t, _, _) -#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i Date: Tue, 23 Jan 2024 17:15:53 +0100 Subject: [PATCH 602/718] reset "mem structure overflowed" state on shutdown --- driver/others/memory.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index caef3e2b7..4ee8f9a2e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } - if (memory_overflowed) + if (memory_overflowed) { for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ newmemory[pos].pos = -1; #endif newmemory[pos].lock = 0; + } + free(newmemory); + newmemory = NULL; + memory_overflowed = 0; } UNLOCK_COMMAND(&alloc_lock); From 86943afa9cb96a49ad036c1a484390967b322b5e Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Wed, 24 Jan 2024 10:53:13 +0300 Subject: [PATCH 603/718] Fix x280 taget include riscv_vector.h --- common_riscv64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_riscv64.h b/common_riscv64.h index 4b5f7dcc4..ab3bfa25a 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) +#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) # include #endif From 73530b03fa6ecd03e7ceb2b37c234a0bb1626445 Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Wed, 24 Jan 2024 11:38:14 +0300 Subject: [PATCH 604/718] remove RISCV64_ZVL256B additional extentions --- Makefile.prebuild | 2 +- Makefile.riscv64 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 7824e15a8..98acca80e 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,7 +60,7 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL256B) -TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d +TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d endif ifeq ($(TARGET), RISCV64_ZVL128B) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 9d314d074..113cc57c5 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -7,8 +7,8 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL256B) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d -FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static +CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d +FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL128B) CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d From 276e3ebf9e1405196d36c2570bce8376b8bad2fd Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 26 Jan 2024 10:03:50 +0800 Subject: [PATCH 605/718] LoongArch64: Add dzamax and dzamin opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/camax_lasx.S | 150 +++++++++------- kernel/loongarch64/camax_lsx.S | 195 ++++++++++++--------- kernel/loongarch64/camin_lasx.S | 164 ++++++++++-------- kernel/loongarch64/camin_lsx.S | 209 +++++++++++++---------- 6 files changed, 418 insertions(+), 304 deletions(-) diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c365e9a75..e27ce3bee 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S SAMAXKERNEL = amax_lsx.S DAMAXKERNEL = amax_lsx.S CAMAXKERNEL = camax_lsx.S +ZAMAXKERNEL = camax_lsx.S SAMINKERNEL = amin_lsx.S DAMINKERNEL = amin_lsx.S CAMINKERNEL = camin_lsx.S +ZAMINKERNEL = camin_lsx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 68360faaf..f4429cfba 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S SAMAXKERNEL = amax_lasx.S DAMAXKERNEL = amax_lasx.S CAMAXKERNEL = camax_lasx.S +ZAMAXKERNEL = camax_lasx.S SAMINKERNEL = amin_lasx.S DAMINKERNEL = amin_lasx.S CAMINKERNEL = camin_lasx.S +ZAMINKERNEL = camin_lasx.S SMAXKERNEL = max_lsx.S DMAXKERNEL = max_lsx.S diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S index 7013430cb..f9a4e9012 100644 --- a/kernel/loongarch64/camax_lasx.S +++ b/kernel/loongarch64/camax_lasx.S @@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMAX VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmax.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMAX VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmax.s VM1, x1, x2 - xvfmax.s VM0, x3, x4 - xvfmax.s VM0, VM0, VM1 + XVFMAX VM0, x1, x2 + XVFMAX VM1, x3, x4 + XVFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S index 2e55629de..cf46cb016 100644 --- a/kernel/loongarch64/camax_lsx.S +++ b/kernel/loongarch64/camax_lsx.S @@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMAX VM1, x1, VM1 + VFMAX VM0, VM0, VM1 +#endif addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmax.s VM1, x1, VM1 - vfmax.s VM0, VM0, VM1 + addi.d I, I, -1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMAX VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmax.s VM1, x1, x2 - vfmax.s VM0, x3, x4 - vfmax.s VM0, VM0, VM1 + VFMAX VM1, x1, x2 + VFMAX VM0, x3, x4 + VFMAX VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmax.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMAX s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmax.s s1, s1, s2 - fmax.s s3, s3, s4 - fmax.s s1, s1, s3 + FMAX s1, s1, s2 + FMAX s3, s3, s4 + FMAX s1, s1, s3 .align 3 .L23: //N<8 @@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmax.s s1, a0, s1 + FMAX s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S index d7931d30a..c1c4c98c8 100644 --- a/kernel/loongarch64/camin_lasx.S +++ b/kernel/loongarch64/camin_lasx.S @@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + xvreplve0.d VM0, VM0 +#else xvreplve0.w VM0, VM0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - xvreplgr2vr.w neg1, I - xvffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvld VX1, X, 8 * SIZE - addi.d I, I, -1 + xvld VX0, X, 0 + xvld VX1, X, 32 +#ifdef DOUBLE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 - xvfmul.s x3, neg1, x1 - xvfmul.s x4, neg1, x2 - xvfcmp.clt.s VT0, x1, res0 - xvfcmp.clt.s VT1, x2, res0 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VT1 +#endif + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#ifdef DOUBLE + xvld VX0, X, 64 + xvld VX1, X, 96 + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + XVFSUB x3, res0, x1 + XVFSUB x4, res0, x2 + XVFMAX x1, x1, x3 + XVFMAX x2, x2, x4 + XVFADD VM1, x1, x2 + XVFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - xvfadd.s VM1, x1, x2 - xvfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + xvpickve.d x1, VM0, 0 + xvpickve.d x2, VM0, 1 + XVFMIN VM0, x1, x2 +#else xvpickve.w x1, VM0, 0 xvpickve.w x2, VM0, 1 xvpickve.w x3, VM0, 2 xvpickve.w x4, VM0, 3 - xvfmin.s VM1, x1, x2 - xvfmin.s VM0, x3, x4 - xvfmin.s VM0, VM0, VM1 + XVFMIN VM0, x1, x2 + XVFMIN VM1, x3, x4 + XVFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FABS a1, a1 ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S index e9ad6b04d..ff666ea8f 100644 --- a/kernel/loongarch64/camin_lsx.S +++ b/kernel/loongarch64/camin_lsx.S @@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vxor.v res0, res0, res0 bge $r0, N, .L999 bge $r0, INCX, .L999 - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s s1, a1, a0 + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE + FABS a0, a0 + FABS a1, a1 + ADD s1, a1, a0 +#ifdef DOUBLE + vreplvei.d VM0, VM0, 0 +#else vreplvei.w VM0, VM0, 0 +#endif li.d TEMP, 1 - li.w I, -1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT - vreplgr2vr.w neg1, I - vffint.s.w neg1, neg1 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L23 .align 3 .L10: - vld VX0, X, 0 * SIZE - vld VX1, X, 4 * SIZE - addi.d I, I, -1 + vld VX0, X, 0 + vld VX1, X, 16 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 - vld VX0, X, 8 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vld VX1, X, 12 * SIZE - vfadd.s VM1, x1, x2 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 32 + vld VX1, X, 48 +#ifdef DOUBLE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 +#else vpickev.w x1, VX1, VX0 vpickod.w x2, VX1, VX0 - vfmul.s x3, neg1, x1 - vfmul.s x4, neg1, x2 - vfcmp.clt.s VT0, x1, res0 - vfcmp.clt.s VT1, x2, res0 +#endif + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#ifdef DOUBLE + vld VX0, X, 64 + vld VX1, X, 80 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD VM1, x1, x2 + + vld VX0, X, 96 + vld VX1, X, 112 + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + VFSUB x3, res0, x1 + VFSUB x4, res0, x2 + VFMAX x1, x1, x3 + VFMAX x2, x2, x4 + VFADD x1, x1, x2 + VFMIN VM1, x1, VM1 + VFMIN VM0, VM0, VM1 +#endif + addi.d I, I, -1 addi.d X, X, 16 * SIZE - vbitsel.v x1, x1, x3, VT0 - vbitsel.v x2, x2, x4, VT1 - vfadd.s x1, x1, x2 - vfmin.s VM1, x1, VM1 - vfmin.s VM0, VM0, VM1 blt $r0, I, .L10 .align 3 .L11: +#ifdef DOUBLE + vreplvei.d x1, VM0, 0 + vreplvei.d x2, VM0, 1 + VFMIN VM0, x1, x2 +#else vreplvei.w x1, VM0, 0 vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmin.s VM1, x1, x2 - vfmin.s VM0, x3, x4 - vfmin.s VM0, VM0, VM1 + VFMIN VM1, x1, x2 + VFMIN VM0, x3, x4 + VFMIN VM0, VM0, VM1 +#endif b .L23 .align 3 @@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L21: - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s1, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s1, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 addi.d I, I, -1 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s3, t1, t3 - fld.s t1, X, 0 * SIZE - fld.s t2, X, 1 * SIZE + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s3, t1, t3 + LD t1, X, 0 * SIZE + LD t2, X, 1 * SIZE add.d X, X, INCX - fld.s t3, X, 0 * SIZE - fld.s t4, X, 1 * SIZE + LD t3, X, 0 * SIZE + LD t4, X, 1 * SIZE add.d X, X, INCX - fabs.s t1, t1 - fabs.s t2, t2 - fabs.s t3, t3 - fabs.s t4, t4 - fadd.s t1, t1, t2 - fadd.s t3, t3, t4 - fmin.s s4, t1, t3 + FABS t1, t1 + FABS t2, t2 + FABS t3, t3 + FABS t4, t4 + ADD t1, t1, t2 + ADD t3, t3, t4 + FMIN s4, t1, t3 blt $r0, I, .L21 .align 3 .L22: - fmin.s s1, s1, s2 - fmin.s s3, s3, s4 - fmin.s s1, s1, s3 + FMIN s1, s1, s2 + FMIN s3, s3, s4 + FMIN s1, s1, s3 .align 3 .L23: //N<8 @@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L24: - fld.s a0, X, 0 * SIZE - fld.s a1, X, 1 * SIZE + LD a0, X, 0 * SIZE + LD a1, X, 1 * SIZE addi.d I, I, -1 - fabs.s a0, a0 - fabs.s a1, a1 - fadd.s a0, a0, a1 + FABS a0, a0 + FABS a1, a1 + ADD a0, a0, a1 add.d X, X, INCX - fmin.s s1, a0, s1 + FMIN s1, a0, s1 blt $r0, I, .L24 .align 3 .L999: - fmov.s $f0, $f22 + MOV $f0, $f22 jirl $r0, $r1, 0x0 .align 3 From ec2aa32eb069a8aa9b2013c1f6af6b4cddb1b0dc Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Thu, 25 Jan 2024 15:20:58 +0000 Subject: [PATCH 606/718] Fix crash in cpuid_riscv64.c The crash is reproducible when building OpenBLAS without forcing a target in a riscv64 container running on an X86_64 machine with an older version of QEMU, e.g., 7.0.0, registered with binfmt_misc to run riscv64 binaries. With this setup, cat /proc/cpuinfo in the container returns the cpu information for the host, which contains a "model name" string, and we execute the buggy code. The code in question is searching in an uninitialised buffer for the ':' character and doesn't check to see whether it was found or not. This can result in pmodel containing the pointer value 1 and a crash when pmodel is defererenced. The algorithm to detect the C910V CPU has not been modified, merely fixed to prevent the crash. A few additional checks for NULL pointers are added to improve the robustness of the code and a whitespace error is corrected. --- cpuid_riscv64.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 894d2b873..7e08af831 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -86,23 +86,29 @@ int detect(void){ char *pmodel = NULL, *pisa = NULL; infile = fopen("/proc/cpuinfo", "r"); + if (!infile) + return CPU_GENERIC; while (fgets(buffer, sizeof(buffer), infile)){ if(!strncmp(buffer, "model name", 10)){ strcpy(model_buffer, buffer); - pmodel = strchr(isa_buffer, ':') + 1; + pmodel = strchr(model_buffer, ':'); + if (pmodel) + pmodel++; } if(!strncmp(buffer, "isa", 3)){ strcpy(isa_buffer, buffer); - pisa = strchr(isa_buffer, '4') + 1; + pisa = strchr(isa_buffer, '4'); + if (pisa) + pisa++; } } fclose(infile); - if (!pmodel) + if (!pmodel || !pisa) return(CPU_GENERIC); - + if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) return CPU_C910V; From e0b610d01ff4cdb44fd2453b7efa7f51ea84e575 Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Fri, 26 Jan 2024 13:57:33 +0000 Subject: [PATCH 607/718] Harmonize riscv64 LIBNAME for forced and non-forced targets The forced values for LIBNAME were either riscv64_generic or c910v while the non-forced value of LIBNAME was always riscv64. --- cpuid_riscv64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 7e08af831..13009753a 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -78,6 +78,11 @@ static char *cpuname[] = { "C910V" }; +static char *cpuname_lower[] = { + "riscv64_generic", + "c910v" +}; + int detect(void){ #ifdef __linux FILE *infile; @@ -146,5 +151,5 @@ void get_cpuconfig(void){ } void get_libname(void){ - printf("riscv64\n"); + printf("%s", cpuname_lower[detect()]); } From 48a4c4d454530d46c0143f4f9bd3f5f7a8374757 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Jan 2024 16:30:52 +0100 Subject: [PATCH 608/718] Use +sve in arch declarations of the fallback paths for SVE targets --- Makefile.arm64 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index ed52a9424..ca053b03d 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -116,13 +116,13 @@ endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif @@ -138,7 +138,7 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 @@ -156,13 +156,13 @@ endif endif endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif endif else -CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif From 519ea6e87aa357787896986836f853192c829930 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 10:39:22 +0800 Subject: [PATCH 609/718] utest: Add utest for the {sc/dz}amax and {s/d/sc/dz}amin --- utest/CMakeLists.txt | 1 + utest/Makefile | 3 +- utest/test_amax.c | 35 +++++++++++++++-- utest/test_amin.c | 89 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 utest/test_amin.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index c47954ce4..41829bd22 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -16,6 +16,7 @@ else () test_dnrm2.c test_swap.c test_zscal.c + test_amin.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index d0715c754..8acaa3ea9 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,8 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ + test_amin.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index a9e5a1c85..e9775caf0 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2016, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,9 +13,9 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" @@ -57,4 +57,31 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } #endif +#ifdef BUILD_COMPLEX +CTEST(amax, scamax){ + blasint N = 9, inc = 1; + float te_max = 0.0, tr_max = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + te_max = BLASFUNC(scamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amax, dzamax){ + blasint N = 9, inc = 1; + double te_max = 0.0, tr_max = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_max = BLASFUNC(dzamax)(&N, x, &inc); + tr_max = 20.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} +#endif diff --git a/utest/test_amin.c b/utest/test_amin.c new file mode 100644 index 000000000..1305ab8ef --- /dev/null +++ b/utest/test_amin.c @@ -0,0 +1,89 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(amin, samin){ + blasint N = 3, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(samin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_DOUBLE +CTEST(amin, damin){ + blasint N = 3, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9 }; + + te_min = BLASFUNC(damin)(&N, x, &inc); + tr_min = 1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX +CTEST(amin, scamin){ + blasint N = 9, inc = 1; + float te_min = 0.0, tr_min = 0.0; + float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(scamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} +#endif +#ifdef BUILD_COMPLEX16 +CTEST(amin, dzamin){ + blasint N = 9, inc = 1; + double te_min = 0.0, tr_min = 0.0; + double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, + -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, + -7.7, 8.8 }; + + te_min = BLASFUNC(dzamin)(&N, x, &inc); + tr_min = 3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} +#endif From a79d11740580db13f101b76c81a02f93654de9a7 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:03:56 +0800 Subject: [PATCH 610/718] LoogArch64: Fixed bug for {s/d}amin --- kernel/loongarch64/amin_lasx.S | 1 - kernel/loongarch64/amin_lsx.S | 1 - 2 files changed, 2 deletions(-) diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S index 0a4359002..c91a33006 100644 --- a/kernel/loongarch64/amin_lasx.S +++ b/kernel/loongarch64/amin_lasx.S @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else xvldrepl.w VM0, X, 0 #endif - XVFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 4 diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S index 644caf43c..c3c3f4ae9 100644 --- a/kernel/loongarch64/amin_lsx.S +++ b/kernel/loongarch64/amin_lsx.S @@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else vldrepl.w VM0, X, 0 #endif - VFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 3 From 3d4dfd008556b5a722162def487e0553f807e6e8 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 11:25:59 +0800 Subject: [PATCH 611/718] Benchmark: Rename the executable file names for {sc/dz}a{min/max} No interface named {c/z}a{min/max}, keeping it would cause ambiguity --- benchmark/Makefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index d9ddb9042..6a7c54636 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ismax.goto idmax.goto \ isamin.goto idamin.goto icamin.goto izamin.goto \ ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ + samax.goto damax.goto scamax.goto dzamax.goto \ smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ + samin.goto damin.goto scamin.goto dzamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) @@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) damax.goto : damax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMAX ############################################## +scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMAX ############################################## +dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMAX ############################################## @@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) damin.goto : damin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) +############################################## SCAMIN ############################################## +scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) +############################################## DZAMIN ############################################## +dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMIN ############################################## @@ -3383,10 +3383,10 @@ samax.$(SUFFIX) : amax.c damax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camax.$(SUFFIX) : amax.c +scamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamax.$(SUFFIX) : amax.c +dzamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c damin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -camin.$(SUFFIX) : amin.c +scamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zamin.$(SUFFIX) : amin.c +dzamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ From 0d7fe5ea610d46afaed9f5164f6a11729e2429de Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 29 Jan 2024 22:33:47 -0800 Subject: [PATCH 612/718] clean up whitespace --- driver/others/blas_server_win32.c | 192 +++++++++++++----------------- 1 file changed, 85 insertions(+), 107 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5820a55f4..68dde584b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -72,19 +72,9 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads -#if defined (__GNUC__) && (__GNUC__ < 6) - #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) -#else - #if defined(_WIN64) - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) - #else - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) - #endif -#endif - static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ - if (!(mode & BLAS_COMPLEX)){ + if (!(mode & BLAS_COMPLEX)) { #ifdef EXPRECISION if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ @@ -99,7 +89,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -110,7 +100,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -122,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> b, args -> ldb, args -> c, args -> ldc, sb); #ifdef BUILD_BFLOAT16 - } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, @@ -133,7 +123,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, @@ -144,7 +134,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, @@ -161,7 +151,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } else { #ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -175,7 +165,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -205,10 +195,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -/* This is a main routine of threads. Each thread waits until job is */ -/* queued. */ - -static DWORD WINAPI blas_thread_server(void *arg){ +// This is a main routine of threads. Each thread waits until job is +// queued. +static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; @@ -221,24 +210,22 @@ static DWORD WINAPI blas_thread_server(void *arg){ MT_TRACE("Server[%2ld] Thread is started!\n", cpu); - while (1){ + while (1) { /* Waiting for Queue */ MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); - // event raised when work is added to the queue - WaitForSingleObject(kickoff_event, INFINITE); + // event raised when work is added to the queue + WaitForSingleObject(kickoff_event, INFINITE); - if (cpu > thread_target - 2) - { - //MT_TRACE("thread [%d] exiting.\n", cpu); - break; // excess thread, so worker thread exits - } + if (cpu > thread_target - 2) { + //MT_TRACE("thread [%d] exiting.\n", cpu); + break; // excess thread, so worker thread exits + } MT_TRACE("Server[%2ld] Got it.\n", cpu); -#if 1 EnterCriticalSection(&queue_lock); queue = work_queue; @@ -246,51 +233,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ work_queue = work_queue->next; LeaveCriticalSection(&queue_lock); -#else - volatile blas_queue_t* queue_next; - - INT_PTR prev_value; - do { - queue = (volatile blas_queue_t*)work_queue; - if (!queue) - break; - - queue_next = (volatile blas_queue_t*)queue->next; - prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); - } while (prev_value != queue); -#endif - if (queue) { + if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; sa = queue -> sa; sb = queue -> sb; -#ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); -#endif + #ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); + #endif MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); // fprintf(stderr, "queue start[%ld]!!!\n", cpu); -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; + #endif - if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + if (sa == NULL) + sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)){ + if (!(queue -> mode & BLAS_COMPLEX)) { #ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { #ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -324,26 +299,25 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Other types in future */ } } - queue->sb=sb; + queue->sb=sb; } -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING2; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; + #endif if (!(queue -> mode & BLAS_LEGACY)) { - - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } else { - legacy_exec(routine, queue -> mode, queue -> args, sb); + legacy_exec(routine, queue -> mode, queue -> args, sb); } - }else{ - continue; //if queue == NULL - } + } else { + continue; //if queue == NULL + } MT_TRACE("Server[%2ld] Finished!\n", cpu); - queue->finished = 1; + queue->finished = 1; } /* Shutdown procedure */ @@ -353,10 +327,12 @@ static DWORD WINAPI blas_thread_server(void *arg){ blas_memory_free(buffer); return 0; - } +} -/* Initializing routine */ -int blas_thread_init(void){ +// +// Initializing routine +// +int blas_thread_init(void) { BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; @@ -365,16 +341,16 @@ int blas_thread_init(void){ MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); - if (!blas_server_avail){ - // create the kickoff Event - kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!blas_server_avail) { + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - thread_target = blas_cpu_number; + thread_target = blas_cpu_number; InitializeCriticalSection(&queue_lock); - for(i = 0; i < blas_cpu_number - 1; i++){ - //MT_TRACE("thread_init: creating thread [%d]\n", i); + for(i = 0; i < blas_cpu_number - 1; i++) { + //MT_TRACE("thread_init: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -391,13 +367,10 @@ int blas_thread_init(void){ /* User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. */ - -int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ +int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -417,7 +390,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif - current->finished = 0; + current->finished = 0; current = current -> next; pos ++; } @@ -426,18 +399,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (!work_queue) { - work_queue = queue; + work_queue = queue; } else { blas_queue_t *next_item = work_queue; - // find the end of the work queue - while (next_item) - next_item = next_item->next; + // find the end of the work queue + while (next_item) + next_item = next_item->next; - // add new work to the end - next_item = queue; + // add new work to the end + next_item = queue; } LeaveCriticalSection(&queue_lock); @@ -447,20 +420,24 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ return 0; } -int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ +// +// Join. Wait for all queued tasks to complete +// +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { - MT_TRACE("Synchronization Waiting.\n"); + MT_TRACE("Synchronization Waiting.\n"); - while (num){ - MT_TRACE("Waiting Queue ..\n"); - while (!queue->finished) - YIELDING; + while (num) { + MT_TRACE("Waiting Queue ..\n"); - queue = queue->next; - num--; - } + while (!queue->finished) + YIELDING; - MT_TRACE("Completely Done.\n\n"); + queue = queue->next; + num--; + } + + MT_TRACE("Completely Done.\n\n"); // if work was added to the queue after this batch we can't sleep the worker threads // by resetting the event @@ -474,8 +451,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ return 0; } -/* Execute Threads */ -int exec_blas(BLASLONG num, blas_queue_t *queue){ +// +// Execute Threads +// +int exec_blas(BLASLONG num, blas_queue_t *queue) { #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -507,9 +486,8 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ return 0; } -/* Shutdown procedure, but user don't have to call this routine. The */ -/* kernel automatically kill threads. */ - +// Shutdown procedure, but user don't have to call this routine. The +// kernel automatically kill threads. int BLASFUNC(blas_thread_shutdown)(void){ int i; @@ -518,9 +496,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ LOCK_COMMAND(&server_lock); - if (blas_server_avail){ + if (blas_server_avail) { - for(i = 0; i < blas_num_threads - 1; i++){ + for(i = 0; i < blas_num_threads - 1; i++) { // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -555,7 +533,7 @@ void goto_set_num_threads(int num_threads) if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; - if (blas_server_avail && num_threads < blas_num_threads) { + if (blas_server_avail && num_threads < blas_num_threads) { LOCK_COMMAND(&server_lock); thread_target = num_threads; @@ -586,7 +564,7 @@ void goto_set_num_threads(int num_threads) thread_target = num_threads; //increased_threads = 1; - if (!blas_server_avail){ + if (!blas_server_avail) { // create the kickoff Event kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); @@ -595,7 +573,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, From 83ce97a4ca44c1aedc9f825bcb11f3a999f09c60 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 16:54:14 +0800 Subject: [PATCH 613/718] LoongArch64: Handle NAN and INF --- kernel/loongarch64/cscal_lasx.S | 149 +------------------------------- kernel/loongarch64/cscal_lsx.S | 130 +--------------------------- 2 files changed, 4 insertions(+), 275 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index 3605a6c0e..f53526663 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 xvld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAI, x2 - xvfsub.d x3, VXZ, x3 - xvfmul.d x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAI, x2 - xvfsub.s x3, VXZ, x3 - xvfmul.s x4, VXAI, x1 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index f442a754f..241d3d16e 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L113 //alpha_r != 0.0 && alpha_i == 0.0 .L14: - bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 b .L111 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: //alpha_r == 0.0 && alpha_i != 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - .L113: //alpha_r != 0.0 && alpha_i == 0.0 vld VX0, X, 0 * SIZE #ifdef DOUBLE @@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L223 //alpha_r != 0.0 && alpha_i == 0.0 .L24: - bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0 + bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 b .L221 //alpha_r == 0.0 && alpha_i == 0.0 .align 3 @@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L222: //alpha_r == 0.0 && alpha_i != 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAI, x2 - vfsub.d x3, VXZ, x3 - vfmul.d x4, VXAI, x1 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAI, x2 - vfsub.s x3, VXZ, x3 - vfmul.s x4, VXAI, x1 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L222 - b .L997 - .align 3 - .L223: //alpha_r != 0.0 && alpha_i == 0.0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE From bb043a021f138a3915c835776fdfe90673644db4 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 30 Jan 2024 17:27:59 +0800 Subject: [PATCH 614/718] utest: Add tests for zscal --- utest/test_zscal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 8992eee90..ffc851e8b 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -20,6 +20,18 @@ CTEST(zscal, i_nan) ASSERT_TRUE(isnan(nan[17])); } +CTEST(zscal, i_nan_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, + NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, i, &nan, 2); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + CTEST(zscal, nan_i) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -30,7 +42,19 @@ CTEST(zscal, nan_i) ASSERT_TRUE(isnan(i[16])); ASSERT_TRUE(isnan(i[17])); } - + +CTEST(zscal, nan_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + cblas_zscal(9, &nan, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isnan(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isnan(i[17])); +} + CTEST(zscal, i_inf) { double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; @@ -40,7 +64,19 @@ CTEST(zscal, i_inf) ASSERT_TRUE(isinf(inf[1])); ASSERT_TRUE(isnan(inf[16])); ASSERT_TRUE(isinf(inf[17])); -} +} + +CTEST(zscal, i_inf_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, + INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, i, &inf, 2); + ASSERT_TRUE(isnan(inf[0])); + ASSERT_TRUE(isinf(inf[1])); + ASSERT_TRUE(isnan(inf[16])); + ASSERT_TRUE(isinf(inf[17])); +} CTEST(zscal, inf_i) { @@ -53,4 +89,16 @@ CTEST(zscal, inf_i) ASSERT_TRUE(isinf(i[17])); } +CTEST(zscal, inf_i_inc_2) +{ + double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, + 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 }; + double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0}; + cblas_zscal(9, &inf, &i, 2); + ASSERT_TRUE(isnan(i[0])); + ASSERT_TRUE(isinf(i[1])); + ASSERT_TRUE(isnan(i[16])); + ASSERT_TRUE(isinf(i[17])); +} + #endif From 09bb48d1b97aaf60ea40707658bf6761a9637424 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 30 Jan 2024 09:13:16 -0600 Subject: [PATCH 615/718] Vectorize in-copy packing/copying for SGEMM - 4X faster. --- kernel/power/KERNEL.POWER10 | 2 +- kernel/power/KERNEL.POWER8 | 7 +- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/sgemm_ncopy_16_power.c | 482 ++++++++++++++++++++++++++++ 4 files changed, 487 insertions(+), 6 deletions(-) create mode 100755 kernel/power/sgemm_ncopy_16_power.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..c84cd91d2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S endif SGEMMKERNEL = sgemm_kernel_power10.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 2b8e65948..36222348a 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,11 +1,9 @@ # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -50,11 +48,9 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -90,6 +86,9 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif +SGEMMINCOPY = sgemm_ncopy_16_power.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) + STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index b6b102b3e..7d007d1a2 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c new file mode 100755 index 000000000..babe1376e --- /dev/null +++ b/kernel/power/sgemm_ncopy_16_power.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp19 ; + IFLOAT ctemp21, ctemp23 ; + IFLOAT ctemp25, ctemp27 ; + IFLOAT ctemp29, ctemp31 ; + + aoffset = a; + boffset = b; + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float c9, c10, c11, c12, c13, c14, c15, c16; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + vector float t9, t10, t11, t12; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + c9 = vec_xl(0, aoffset9); + c10 = vec_xl(0, aoffset10); + c11 = vec_xl(0, aoffset11); + c12 = vec_xl(0, aoffset12); + c13 = vec_xl(0, aoffset13); + c14 = vec_xl(0, aoffset14); + c15 = vec_xl(0, aoffset15); + c16 = vec_xl(0, aoffset16); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t9 = vec_mergeh(c9, c10); + t10 = vec_mergeh(c11, c12); + t11 = vec_mergeh(c13, c14); + t12 = vec_mergeh(c15, c16); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t9 = vec_mergel(c9, c10); + t10 = vec_mergel(c11, c12); + t11 = vec_mergel(c13, c14); + t12 = vec_mergel(c15, c16); + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + aoffset9 += 4; + aoffset10 += 4; + aoffset11 += 4; + aoffset12 += 4; + aoffset13 += 4; + aoffset14 += 4; + aoffset15 += 4; + aoffset16 += 4; + boffset += 64; + + i --; + }while(i > 0); + } + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + aoffset9+=1; + aoffset10+=1; + aoffset11+=1; + aoffset12+=1; + aoffset13+=1; + aoffset14+=1; + aoffset15+=1; + aoffset16+=1; + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + boffset += 32; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + + boffset += 8; + i--; + } while (i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset += 16; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + + boffset += 4; + i--; + } while (i > 0); + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} From 969601a1dcfdc4c44174346b7c752fa338f00737 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 31 Jan 2024 11:20:25 +0800 Subject: [PATCH 616/718] X86_64: Fixed bug in zscal Fixed handling of NAN and INF arguments when inc is greater than 1. --- kernel/x86_64/zscal.c | 91 ++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 66c8a0d2b..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { x[i]= NAN; @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (x[i] < -FLT_MAX || x[i] > FLT_MAX) temp0 = NAN; x[i+1] = da_i * x[i]; - if ( x[i] == x[i]) //preserve NaN + if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - + temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } From 1a6fdb035308370c08c740da279b769615594980 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 15:57:57 +0100 Subject: [PATCH 617/718] Add prototypes for extensions ?AMIN/?AMAX and CAXPYC/ZAXPYC --- cblas.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cblas.h b/cblas.h index ade2fca3a..3b74e25ee 100644 --- a/cblas.h +++ b/cblas.h @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -290,7 +303,6 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); - void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, From b54cda849096ade35bd2f69341e3d02fa1543512 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 16:00:52 +0100 Subject: [PATCH 618/718] Unify creation of CBLAS interfaces for ?AMIN/?AMAX and C/ZAXPYC between gmake and cmake builds --- interface/CMakeLists.txt | 2 ++ interface/Makefile | 49 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ed19b556a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -130,6 +130,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..99859cbf5 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) From a7d004e820f1ccbc9f61b4b1353ccdb04f208690 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 17:55:42 +0100 Subject: [PATCH 619/718] Fix CBLAS prototype --- interface/max.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/interface/max.c b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; From 47bd06476312598eea694f19a00a9191041b1586 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Jan 2024 20:49:43 +0100 Subject: [PATCH 620/718] Fix names in build rules --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 99859cbf5..ad4a0fb89 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1541,10 +1541,10 @@ cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) -cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c @@ -1553,10 +1553,10 @@ cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c From 42cb567f0f9f6c8ef27558e5b61251b0805aae6d Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Wed, 31 Jan 2024 13:24:28 -0800 Subject: [PATCH 621/718] more cleanup --- driver/others/blas_server_win32.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 68dde584b..ee6d08f8c 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -72,7 +72,10 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads -static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ +// +// +// +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { if (!(mode & BLAS_COMPLEX)) { #ifdef EXPRECISION @@ -195,8 +198,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -// This is a main routine of threads. Each thread waits until job is -// queued. +// +// This is a main routine of threads. Each thread waits until job is queued. +// static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ @@ -488,7 +492,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) { // Shutdown procedure, but user don't have to call this routine. The // kernel automatically kill threads. -int BLASFUNC(blas_thread_shutdown)(void){ +int BLASFUNC(blas_thread_shutdown)(void) { int i; @@ -563,7 +567,7 @@ void goto_set_num_threads(int num_threads) thread_target = num_threads; - //increased_threads = 1; + //increased_threads = 1; if (!blas_server_avail) { // create the kickoff Event kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); From 61c8e19f95dfefe3e586d6d83f7432b7d51299fa Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 31 Jan 2024 15:27:50 -0600 Subject: [PATCH 622/718] Fix Makefile to support OpenMP on AIX for xlc (clang) with xlf. --- ctest/Makefile | 3 +++ test/Makefile | 3 +++ utest/Makefile | 10 ++++++++++ 3 files changed, 16 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index af5b34a36..ad960b35a 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/test/Makefile b/test/Makefile index 56acf1c5b..5a4694ce6 100644 --- a/test/Makefile +++ b/test/Makefile @@ -276,6 +276,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/utest/Makefile b/utest/Makefile index 8acaa3ea9..ac58d6f12 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,8 +45,18 @@ endif all : run_test +ifeq ($(OSNAME), AIX) +ifeq ($(USE_OPENMP), 1) +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) +else +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif +else $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif run_test: $(UTESTBIN) ifneq ($(CROSS), 1) From a3b0ef6596d51ecfb59b0a2f6a7b0d59bc4f18b4 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Thu, 1 Feb 2024 10:26:02 +0000 Subject: [PATCH 623/718] Restore riscv64 fixes from develop branch: dot product double precision accumulation, zscal NaN handling --- Makefile.prebuild | 1 + kernel/riscv64/dot.c | 10 ++++ kernel/riscv64/zscal_rvv.c | 90 ++++++----------------------------- kernel/riscv64/zscal_vector.c | 79 +----------------------------- 4 files changed, 26 insertions(+), 154 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index b44b50039..b7d695a75 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -57,6 +57,7 @@ endif ifeq ($(TARGET), CK860FV) TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float +endif ifeq ($(TARGET), x280) TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index bf55998ca..8ad493a2b 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + +#if defined(DSDOT) double dot = 0.0 ; +#else + FLOAT dot = 0.0 ; +#endif if ( n < 1 ) return(dot); while(i < n) { +#if defined(DSDOT) + dot += (double) y[iy] * (double) x[ix] ; +#else dot += y[iy] * x[ix] ; +#endif + ix += inc_x ; iy += inc_y ; i++ ; diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index 2586c6036..ae79d9f9d 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -69,49 +69,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F size_t vlmax = VSETVL_MAX; FLOAT_VX2_T vx2; - if(da_r == 0.0 && da_i == 0.0) { + if(inc_x == 1) { - vr = VFMVVF_FLOAT(0.0, vlmax); - vi = VFMVVF_FLOAT(0.0, vlmax); - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - vx2 = VSET_VX2(vx2, 0, vr); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } - - } else if(da_r == 0.0) { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx2 = VLSEG_FLOAT(x, vl); vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vt = VFMULVF_FLOAT(vi, -da_i, vl); - vi = VFMULVF_FLOAT(vr, da_i, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); - - VSSSEG_FLOAT(x, stride_x, vx2, vl); + VSSEG_FLOAT(x, vx2, vl); } - } else if(da_i == 0.0) { + } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); @@ -120,54 +97,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vr = VGET_VX2(vx2, 0); vi = VGET_VX2(vx2, 1); - vr = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); VSSSEG_FLOAT(x, stride_x, vx2, vl); } - - } else { - - if(inc_x == 1) { - - for (size_t vl; n > 0; n -= vl, x += vl*2) { - vl = VSETVL(n); - - vx2 = VLSEG_FLOAT(x, vl); - vr = VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSEG_FLOAT(x, vx2, vl); - } - - } else { - - for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { - vl = VSETVL(n); - - vx2 = VLSSEG_FLOAT(x, stride_x, vl); - vr = VGET_VX2(vx2, 0); - vi = VGET_VX2(vx2, 1); - - vt = VFMULVF_FLOAT(vr, da_r, vl); - vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); - vi = VFMULVF_FLOAT(vi, da_r, vl); - vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); - - vx2 = VSET_VX2(vx2, 0, vt); - vx2 = VSET_VX2(vx2, 1, vi); - VSSSEG_FLOAT(x, stride_x, vx2, vl); - } - } } return(0); diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 2034aafaa..536bbdf73 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; From 2bb7ea64a197c04ce36927530394db5400242d20 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 1 Feb 2024 08:11:43 -0600 Subject: [PATCH 624/718] Only vectorize 64-bit version for Power8. --- kernel/power/KERNEL.POWER8 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 36222348a..700a68e44 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,9 +1,11 @@ # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -48,9 +50,11 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -86,9 +90,6 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -SGEMMINCOPY = sgemm_ncopy_16_power.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) - STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c From ddac75e0efef0deeb8ea189e69909371db13f0ef Mon Sep 17 00:00:00 2001 From: austinpagan Date: Thu, 1 Feb 2024 12:24:25 -0600 Subject: [PATCH 625/718] Adding .C versions of CGEMM and ZGEMM --- kernel/power/KERNEL.POWER10 | 24 +- kernel/power/cgemm_kernel_power10.c | 1154 +++++++++++++++++++++++++++ kernel/power/zgemm_kernel_power10.c | 761 ++++++++++++++++++ 3 files changed, 1931 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.c create mode 100644 kernel/power/zgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..5f49b9c46 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c ifeq ($(OSNAME), AIX) -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c else -CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power10.S +#CTRMMKERNEL = cgemm_kernel_power10.S +#ZTRMMKERNEL = zgemm_kernel_power10.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c endif SGEMMKERNEL = sgemm_kernel_power10.c @@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c ifeq ($(OSNAME), AIX) -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power10.c else -CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMKERNEL = cgemm_kernel_power10.c endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ifeq ($(OSNAME), AIX) -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power10.c else -ZGEMMKERNEL = zgemm_kernel_power10.S +#ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMKERNEL = zgemm_kernel_power10.c endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c new file mode 100644 index 000000000..279c83aec --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.c @@ -0,0 +1,1154 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined (TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 000000000..e4e609067 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,761 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} From 87ba528d8b12d8debf77b5b952b7c31d2ba1bd08 Mon Sep 17 00:00:00 2001 From: austinpagan Date: Thu, 1 Feb 2024 18:46:07 -0600 Subject: [PATCH 626/718] Changed C files to straighten out indentation. Removed commented lines from other file. --- kernel/power/KERNEL.POWER10 | 21 - kernel/power/cgemm_kernel_power10.c | 1725 +++++++++++++-------------- kernel/power/zgemm_kernel_power10.c | 1161 +++++++++--------- 3 files changed, 1411 insertions(+), 1496 deletions(-) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 5f49b9c46..4e408e121 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c -ifeq ($(OSNAME), AIX) -#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -CTRMMKERNEL = cgemm_kernel_power10.c -ZTRMMKERNEL = zgemm_kernel_power10.c -else -#CTRMMKERNEL = cgemm_kernel_power10.S -#ZTRMMKERNEL = zgemm_kernel_power10.S CTRMMKERNEL = cgemm_kernel_power10.c ZTRMMKERNEL = zgemm_kernel_power10.c -endif SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c -ifeq ($(OSNAME), AIX) -#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMKERNEL = cgemm_kernel_power10.c -else -#CGEMMKERNEL = cgemm_kernel_power10.S -CGEMMKERNEL = cgemm_kernel_power10.c -endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c ifeq ($(OSNAME), AIX) @@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ifeq ($(OSNAME), AIX) -#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMKERNEL = zgemm_kernel_power10.c -else -#ZGEMMKERNEL = zgemm_kernel_power10.S -ZGEMMKERNEL = zgemm_kernel_power10.c -endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c index 279c83aec..233768cef 100644 --- a/kernel/power/cgemm_kernel_power10.c +++ b/kernel/power/cgemm_kernel_power10.c @@ -31,15 +31,15 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -#define SET_ACC_ZERO() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } @@ -67,231 +67,231 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #define A_OP += #endif -#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ - __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ - __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ - __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ - __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ - __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ - __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); -#define COMP_MUL_1 \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) -#define COMP_MAC_1(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ } -#define COMP_MUL_2A \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) -#define COMP_MAC_2A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ } -#define COMP_MUL_2B \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) -#define COMP_MAC_2B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ } -#define COMP_MUL_4A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ - COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ } -#define COMP_MAC_4A(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ - COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ } -#define COMP_MUL_4B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ - COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ } -#define COMP_MAC_4B(_offset) { \ - FLOAT *_ro = &res[_offset]; \ - COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ - COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ - COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ - COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ } -#define SAVE_ACC_COMPLEX_11 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_1 \ - COMP_MAC_1(16) \ - COMP_MAC_1(32) \ - COMP_MAC_1(48) \ - COMP_MAC_1(64) \ - COMP_MAC_1(80) \ - COMP_MAC_1(96) \ - COMP_MAC_1(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; -#define SAVE_ACC_COMPLEX_12 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_2A \ - COMP_MAC_2A(16) \ - COMP_MAC_2A(32) \ - COMP_MAC_2A(48) \ - COMP_MAC_2A(64) \ - COMP_MAC_2A(80) \ - COMP_MAC_2A(96) \ - COMP_MAC_2A(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_2B \ - COMP_MAC_2B(16) \ - COMP_MAC_2B(32) \ - COMP_MAC_2B(48) \ - COMP_MAC_2B(64) \ - COMP_MAC_2B(80) \ - COMP_MAC_2B(96) \ - COMP_MAC_2B(112) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4A(0) \ - COMP_MAC_4A(32) \ - COMP_MAC_4A(64) \ - COMP_MAC_4A(96) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_21_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4A(0) \ - COMP_MAC_4A(64) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4A(32) \ - COMP_MAC_4A(96) \ - CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(16) \ - CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(32) \ - CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(48) \ - CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - COMP_MUL_4B(16) \ - CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL_4B(0) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_24_ALL \ +#define SAVE_ACC_COMPLEX_24_ALL \ __builtin_mma_disassemble_acc ((void *)result, &acc0); \ __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ @@ -332,205 +332,205 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ - CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ - CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ - CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ - CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ - CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ - CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ - CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ - CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ - CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ - CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ - CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ - CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ - CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ - CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ - CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ - CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ - CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ - CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ - CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ - CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ - CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ - CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ - CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ - CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ - CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ - CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ - CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ - CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ - CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ - CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ - CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ - CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ - CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ - CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ - CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ - CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ - CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ - CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ - CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ - CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ - CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ - CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ - CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ - CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ - CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ - CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ - CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ - CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ - CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ - CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ - CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ - CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ - CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ - CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ - CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ - CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ - CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ - CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ - CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ - CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ - CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ - CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ - CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ - CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; -#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ - __builtin_mma_disassemble_acc ((void *)result, ACC1); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ - COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ - COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ - COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ - COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ - COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ - COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ - COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ - COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ - CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ - CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ - CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ - CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ - CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ - CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ - CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ - CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; -#define SAVE_ACC_COMPLEX_14 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ - COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ - COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ - COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ - COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ - COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ - COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ - COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ - COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ - COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ - COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ - COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ - COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ - COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ - COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ - COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define REFRESH_TEMP_BK(x, y) \ - temp = k - off; + temp = k - off; #elif defined(LEFT) #define REFRESH_TEMP_BK(x, y) \ - temp = off + x; + temp = off + x; #else #define REFRESH_TEMP_BK(x, y) \ - temp = off + y; + temp = off + y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_POINTERS(x, y) \ - BO = B; \ - REFRESH_TEMP_BK(x, y) + BO = B; \ + REFRESH_TEMP_BK(x, y) #else #define REFRESH_POINTERS(x, y) \ - AO += off * (2*x); \ - BO = B + off * (2*y); \ - REFRESH_TEMP_BK(x, y) + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) #endif #ifdef LEFT #define REFRESH_OFF(x) \ - off += x; + off += x; #else #define REFRESH_OFF(x) #endif #ifdef LEFT #define UPDATE_TEMP(x, y) \ - temp -= x; + temp -= x; #else #define UPDATE_TEMP(x, y) \ - temp -= y; + temp -= y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_TMP_AFTER_SAVE(x, y) \ - temp = k - off; \ - UPDATE_TEMP(x, y) \ - AO += temp * (2*x); \ - BO += temp * (2*y); + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); #else #define REFRESH_TMP_AFTER_SAVE(x, y) #endif -#define REFRESH_AFTER_SAVE(x,y) \ - REFRESH_TMP_AFTER_SAVE(x, y) \ - REFRESH_OFF(x) +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) /************************************************************************************* * GEMM Kernel *************************************************************************************/ int -CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, - FLOAT * C, BLASLONG ldc #ifdef TRMMKERNEL - , BLASLONG offset +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) #endif - ) { - BLASLONG i1, i, l, temp; - FLOAT *AO, *BO, *CO; + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; #if defined(TRMMKERNEL) - BLASLONG off; + BLASLONG off; #endif #if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; + off = -offset; #endif - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[32]; - FLOAT *res, tr[64], ti[64]; - res = (FLOAT *) result; + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; - for (i1 = 0; i1 < (n >> 2); i1++) - { + for (i1 = 0; i1 < (n >> 2); i1++) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -538,193 +538,181 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 3; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 4); + REFRESH_POINTERS (8, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); - } + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } SAVE_ACC_COMPLEX_24_ALL - CO += 16; - AO += temp << 4; - BO += temp << 3; + CO += 16; + AO += temp << 4; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 4) + REFRESH_AFTER_SAVE (8, 4) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 4); + REFRESH_POINTERS (4, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); - } - SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) - SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) - CO += 8; - AO += temp << 3; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 4) + REFRESH_AFTER_SAVE (4, 4) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 4); + REFRESH_POINTERS (2, 4); #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) - CO += 4; - AO += temp << 2; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 4) + REFRESH_AFTER_SAVE (2, 4) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 4) + REFRESH_POINTERS (1, 4) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<3]; - vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_14 - CO += 2; - AO += temp << 1; - BO += temp << 3; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 4) + REFRESH_AFTER_SAVE (1, 4) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; // number of values in A + off += 4; // number of values in A #endif B += k << 3; - } + } - if (n & 2) - { + if (n & 2) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -732,212 +720,199 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 2; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 2) + REFRESH_POINTERS (8, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); - __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_22_4 - AO += temp << 4; - BO += temp << 2; - CO += 16; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 2) + REFRESH_AFTER_SAVE (8, 2) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 2) + REFRESH_POINTERS (4, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); - __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); - __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); - __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_22_2 - AO += temp << 3; - BO += temp << 2; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 2) + REFRESH_AFTER_SAVE (4, 2) #endif - } if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 2) + REFRESH_POINTERS (2, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_22_1 - AO += temp << 2; - BO += temp << 2; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 2) + REFRESH_AFTER_SAVE (2, 2) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 2) + REFRESH_POINTERS (1, 2) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; - vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; - vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; - vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_12 - AO += temp<<1; - BO += temp<<2; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 2) + REFRESH_AFTER_SAVE (1, 2) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; // number of values in A + off += 2; // number of values in A #endif B += k << 2; - } + } - if (n & 1) - { + if (n & 1) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -945,210 +920,196 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * CO = C; C += ldc << 1; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 1) + REFRESH_POINTERS (8, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<4]; - vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_21_4 - AO += temp << 4; - BO += temp << 1; - CO += 16; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 1) + REFRESH_AFTER_SAVE (8, 1) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 1) + REFRESH_POINTERS (4, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<3]; - vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_21_2 - AO += temp << 3; - BO += temp << 1; - CO += 8; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 1) + REFRESH_AFTER_SAVE (4, 1) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 1) + REFRESH_POINTERS (2, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; - vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; - vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; - vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; - vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; - vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; - vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<2]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_21_1 - AO += temp << 2; - BO += temp << 1; - CO += 4; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 1) + REFRESH_AFTER_SAVE (2, 1) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 1) + REFRESH_POINTERS (1, 1) #else - BO = B; - temp = k; + BO = B; + temp = k; #endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; - vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; - vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; - vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; - vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; - vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; - vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - vec_t rowA1 = *(vec_t *) & AO[l<<1]; - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_11 - AO += temp<<1; - BO += temp<<1; - CO += 2; + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 1) + REFRESH_AFTER_SAVE (1, 1) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; // number of values in A + off += 1; // number of values in A #endif B += k << 1; - } - return 0; + } + return 0; } diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c index e4e609067..370d12af3 100644 --- a/kernel/power/zgemm_kernel_power10.c +++ b/kernel/power/zgemm_kernel_power10.c @@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -#define SET_ACC_ZERO() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } @@ -66,696 +66,671 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); #define A_OP += #endif -#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ - __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ - __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ - __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ - __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ - __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ - __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); -#define SAVE_ACC_COMPLEX_11 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; -#define SAVE_ACC_COMPLEX_12 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_1 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ - COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ - COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ - COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ - COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ - COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ - COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; -#define SAVE_ACC_COMPLEX_21_2 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ - COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ - COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ - COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ - COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ - COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ - COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ - COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ - COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_21_4 \ - BUILTIN_MMA_DISASSEMBLE_ACC_8 \ - COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ - COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ - COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ - COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ - COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ - COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ - COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ - COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ - COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ - COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ - COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ - COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ - COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ - COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ - CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ - CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ - CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ - CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ - CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ - CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ - CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ - CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; -#define SAVE_ACC_COMPLEX_22_1 \ - __builtin_mma_disassemble_acc ((void *)result, &acc0); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ - COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ - COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ - COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ - CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; -#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ - __builtin_mma_disassemble_acc ((void *)result, ACC1); \ - __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ - COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ - COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ - COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ - COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ - CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ - CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ - CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ - CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ - CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ - CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ - CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ - CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define REFRESH_TEMP_BK(x, y) \ - temp = k - off; + temp = k - off; #elif defined(LEFT) #define REFRESH_TEMP_BK(x, y) \ - temp = off + x; + temp = off + x; #else #define REFRESH_TEMP_BK(x, y) \ - temp = off + y; + temp = off + y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_POINTERS(x, y) \ - BO = B; \ - REFRESH_TEMP_BK(x, y) + BO = B; \ + REFRESH_TEMP_BK(x, y) #else #define REFRESH_POINTERS(x, y) \ - AO += off * (2*x); \ - BO = B + off * (2*y); \ - REFRESH_TEMP_BK(x, y) + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) #endif #ifdef LEFT #define REFRESH_OFF(x) \ - off += x; + off += x; #else #define REFRESH_OFF(x) #endif #ifdef LEFT #define UPDATE_TEMP(x, y) \ - temp -= x; + temp -= x; #else #define UPDATE_TEMP(x, y) \ - temp -= y; + temp -= y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_TMP_AFTER_SAVE(x, y) \ - temp = k - off; \ - UPDATE_TEMP(x, y) \ - AO += temp * (2*x); \ - BO += temp * (2*y); + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); #else #define REFRESH_TMP_AFTER_SAVE(x, y) #endif -#define REFRESH_AFTER_SAVE(x,y) \ - REFRESH_TMP_AFTER_SAVE(x, y) \ - REFRESH_OFF(x) +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) /************************************************************************************* * GEMM Kernel *************************************************************************************/ int -CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, - FLOAT * C, BLASLONG ldc #ifdef TRMMKERNEL - , BLASLONG offset +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) +#else +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) #endif - ) { - BLASLONG i1, i, l, temp; - FLOAT *AO, *BO, *CO; + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; #if defined(TRMMKERNEL) - BLASLONG off; + BLASLONG off; #endif #if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; + off = -offset; #endif - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[32]; - FLOAT *res, tr[16], ti[16]; - res = (FLOAT *) result; + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; - for (i1 = 0; i1 < (n >> 1); i1++) - { + for (i1 = 0; i1 < (n >> 1); i1++) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif AO = A; CO = C; C += ldc<<2; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 2) + REFRESH_POINTERS (8, 2) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); - __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); - } - __builtin_mma_disassemble_acc ((void *)result, &acc0); - __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); - __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); - __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); - __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); - __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); - __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); - __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); - COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) - COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) - COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) - COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) - COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) - COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) - COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) - COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) - COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) - COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) - COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) - COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) - COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) - COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) - COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) - COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) - CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; - CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; - CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; - CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; - CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; - CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; - CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; - CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; - CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; - CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; - CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; - CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; - CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; - CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; - CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; - CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; - CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; - CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; - CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; - CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; - CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; - CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; - CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; - CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; - CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; - CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; - CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; - CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; - CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; - CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; - CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; - CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; - AO += temp << 4; - BO += temp << 2; - CO += 16; + AO += temp << 4; + BO += temp << 2; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 2) + REFRESH_AFTER_SAVE (8, 2) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 2) + REFRESH_POINTERS (4, 2) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - } - for (l = (temp & (~1)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); - } - SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) - SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) - AO += temp << 3; - BO += temp << 2; - CO += 8; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 2) + REFRESH_AFTER_SAVE (4, 2) #endif - } - if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 2) + REFRESH_POINTERS (2, 2) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_22_1 - AO += temp << 2; - BO += temp << 2; - CO += 4; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 2) + REFRESH_AFTER_SAVE (2, 2) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 2) + REFRESH_POINTERS (1, 2) #else - BO = B; - temp = k; -#endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); - __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); - __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); - __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); - __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - vec_t rowB1 = *(vec_t *) & BO[l<<2]; - vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); - } - SAVE_ACC_COMPLEX_12 - AO += temp << 1; - BO += temp << 2; - CO += 2; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 2) + REFRESH_AFTER_SAVE (1, 2) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) off += 2; // number of values in A #endif B += k << 2; - } - if (n & 1) - { + } + if (n & 1) { #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif AO = A; CO = C; C += ldc<<1; - for (i = 0; i < (m >> 3); i++) - { + for (i = 0; i < (m >> 3); i++) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 1) + REFRESH_POINTERS (8, 1) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~1)); l+=2) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); - __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); - } - for (l = (temp & (~1)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); - } - SAVE_ACC_COMPLEX_21_4 + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 - AO += temp << 4; - BO += temp << 1; - CO += 16; + AO += temp << 4; + BO += temp << 1; + CO += 16; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 1) + REFRESH_AFTER_SAVE (8, 1) #endif - } - if (m & 4) - { + } + if (m & 4) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 1) + REFRESH_POINTERS (4, 1) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~3)); l+=4) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); - } - for (l = (temp & (~3)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); - } - SAVE_ACC_COMPLEX_21_2 - AO += temp << 3; - BO += temp << 1; - CO += 8; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 1) + REFRESH_AFTER_SAVE (4, 1) #endif - } if (m & 2) - { + } + if (m & 2) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 1) + REFRESH_POINTERS (2, 1) #else - BO = B; - temp = k; -#endif - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_21_1 - AO += temp << 2; - BO += temp << 1; - CO += 4; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 1) + REFRESH_AFTER_SAVE (2, 1) #endif - } - if (m & 1) - { + } + if (m & 1) { #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 1) + REFRESH_POINTERS (1, 1) #else - BO = B; - temp = k; -#endif - // RIP OUT MMA STUFF! - SET_ACC_ZERO() - for (l = 0; l < (temp & (~7)); l+=8) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; - vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; - vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; - vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; - vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; - vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; - vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); - __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); - __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); - __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); - __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); - __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); - __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); - } - for (l = (temp & (~7)); l < temp; ++l) - { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - vec_t rowB1 = *(vec_t *) & BO[l<<1]; - __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); - } - SAVE_ACC_COMPLEX_11 - AO += temp << 1; - BO += temp << 1; - CO += 2; + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 1) + REFRESH_AFTER_SAVE (1, 1) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) off += 1; // number of values in A #endif B += k << 1; - } - return 0; + } + return 0; } From f68e9989c47d2643fd72986afc59f6a8d7d07486 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 2 Feb 2024 12:26:23 +0300 Subject: [PATCH 627/718] Remove zero rows/columns matcopy tests --- utest/test_extensions/test_cimatcopy.c | 32 -------------------------- utest/test_extensions/test_comatcopy.c | 32 -------------------------- utest/test_extensions/test_dimatcopy.c | 32 -------------------------- utest/test_extensions/test_domatcopy.c | 32 -------------------------- utest/test_extensions/test_simatcopy.c | 32 -------------------------- utest/test_extensions/test_somatcopy.c | 32 -------------------------- utest/test_extensions/test_zimatcopy.c | 32 -------------------------- utest/test_extensions/test_zomatcopy.c | 32 -------------------------- 8 files changed, 256 deletions(-) diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c index 800f8a2d1..a4b1e30ac 100644 --- a/utest/test_extensions/test_cimatcopy.c +++ b/utest/test_extensions/test_cimatcopy.c @@ -714,38 +714,6 @@ CTEST(cimatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(cimatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda_src = 0, lda_dst = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(cimatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda_src = 100, lda_dst = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda_src. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c index 8a3d5ee7b..71663406a 100644 --- a/utest/test_extensions/test_comatcopy.c +++ b/utest/test_extensions/test_comatcopy.c @@ -524,38 +524,6 @@ CTEST(comatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(comatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda = 0, ldb = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(comatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda = 100, ldb = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index 4debb50e8..d2a16bbbf 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -811,38 +811,6 @@ CTEST(dimatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(dimatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda_src = 0, lda_dst = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(dimatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda_src = 100, lda_dst = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda_src. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c index f692e8784..e60b9c83d 100644 --- a/utest/test_extensions/test_domatcopy.c +++ b/utest/test_extensions/test_domatcopy.c @@ -536,38 +536,6 @@ CTEST(domatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(domatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda = 0, ldb = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(domatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda = 100, ldb = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index 0d9c44e73..cf14d360c 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -811,38 +811,6 @@ CTEST(simatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(simatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda_src = 0, lda_dst = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(simatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda_src = 100, lda_dst = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda_src. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c index c75bbc75e..b53c7cae5 100644 --- a/utest/test_extensions/test_somatcopy.c +++ b/utest/test_extensions/test_somatcopy.c @@ -536,38 +536,6 @@ CTEST(somatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(somatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda = 0, ldb = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(somatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda = 100, ldb = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c index 6461ce88f..8376bc493 100644 --- a/utest/test_extensions/test_zimatcopy.c +++ b/utest/test_extensions/test_zimatcopy.c @@ -714,38 +714,6 @@ CTEST(zimatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** - * Test error function for an invalid param m. - * Must be positive. - */ -CTEST(zimatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda_src = 0, lda_dst = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** - * Test error function for an invalid param n. - * Must be positive. - */ -CTEST(zimatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda_src = 100, lda_dst = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda_src. * If matrices are stored using row major layout, diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c index 8df3dd80f..495831c56 100644 --- a/utest/test_extensions/test_zomatcopy.c +++ b/utest/test_extensions/test_zomatcopy.c @@ -541,38 +541,6 @@ CTEST(zomatcopy, xerbla_invalid_trans) ASSERT_EQUAL(TRUE, passed); } -/** -* Test error function for an invalid param m. -* Must be positive. -*/ -CTEST(zomatcopy, xerbla_invalid_rows) -{ - blasint m = 0, n = 100; - blasint lda = 0, ldb = 100; - char order = 'C'; - char trans = 'T'; - int expected_info = 3; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - -/** -* Test error function for an invalid param n. -* Must be positive. -*/ -CTEST(zomatcopy, xerbla_invalid_cols) -{ - blasint m = 100, n = 0; - blasint lda = 100, ldb = 0; - char order = 'C'; - char trans = 'T'; - int expected_info = 4; - - int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info); - ASSERT_EQUAL(TRUE, passed); -} - /** * Test error function for an invalid param lda. * If matrices are stored using row major layout, From 441339104f5056ba8d68343667511de683d69ca6 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 2 Feb 2024 13:49:39 +0300 Subject: [PATCH 628/718] fix test ext cmake build --- utest/CMakeLists.txt | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index d78701707..c090ed511 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -22,6 +22,7 @@ endif () set(DIR_EXT test_extensions) set(OpenBLAS_utest_ext_src utest_main.c +${DIR_EXT}/common.c ${DIR_EXT}/xerbla.c ${DIR_EXT}/test_isamin.c ${DIR_EXT}/test_idamin.c @@ -39,14 +40,14 @@ ${DIR_EXT}/test_scamax.c ${DIR_EXT}/test_dzamax.c ${DIR_EXT}/test_zrotg.c ${DIR_EXT}/test_crotg.c -$(DIR_EXT)/test_drotmg.c -$(DIR_EXT)/test_srotmg.c -$(DIR_EXT)/test_zscal.c -$(DIR_EXT)/test_cscal.c -$(DIR_EXT)/test_domatcopy.c -$(DIR_EXT)/test_somatcopy.c -$(DIR_EXT)/test_zomatcopy.c -$(DIR_EXT)/test_comatcopy.c +${DIR_EXT}/test_drotmg.c +${DIR_EXT}/test_srotmg.c +${DIR_EXT}/test_zscal.c +${DIR_EXT}/test_cscal.c +${DIR_EXT}/test_domatcopy.c +${DIR_EXT}/test_somatcopy.c +${DIR_EXT}/test_zomatcopy.c +${DIR_EXT}/test_comatcopy.c ${DIR_EXT}/test_simatcopy.c ${DIR_EXT}/test_dimatcopy.c ${DIR_EXT}/test_cimatcopy.c @@ -59,12 +60,12 @@ ${DIR_EXT}/test_saxpby.c ${DIR_EXT}/test_daxpby.c ${DIR_EXT}/test_caxpby.c ${DIR_EXT}/test_zaxpby.c -${DIR_EXT}/test_caxpyc.c -${DIR_EXT}/test_zaxpyc.c -$(DIR_EXT)/test_cgemv_t.c -$(DIR_EXT)/test_zgemv_t.c -$(DIR_EXT)/test_cgemv_n.c -$(DIR_EXT)/test_zgemv_n.c +# ${DIR_EXT}/test_caxpyc.c +# ${DIR_EXT}/test_zaxpyc.c +${DIR_EXT}/test_cgemv_t.c +${DIR_EXT}/test_zgemv_t.c +${DIR_EXT}/test_cgemv_n.c +${DIR_EXT}/test_zgemv_n.c ${DIR_EXT}/test_crot.c ${DIR_EXT}/test_zrot.c ${DIR_EXT}/test_cgbmv.c @@ -75,10 +76,10 @@ ${DIR_EXT}/test_cgemmt.c ${DIR_EXT}/test_zgemmt.c ${DIR_EXT}/test_ztrmv.c ${DIR_EXT}/test_ctrmv.c -$(DIR_EXT)/test_ztrsv.c -$(DIR_EXT)/test_ctrsv.c -$(DIR_EXT)/test_zgemm.c -$(DIR_EXT)/test_cgemm.c +${DIR_EXT}/test_ztrsv.c +${DIR_EXT}/test_ctrsv.c +${DIR_EXT}/test_zgemm.c +${DIR_EXT}/test_cgemm.c ) # crashing on travis cl with an error code suggesting resource not found From b6949ce74c8a22f4ec2b710ac084411f6b6560b0 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Fri, 2 Feb 2024 14:42:27 +0300 Subject: [PATCH 629/718] add axpyc to cmake build --- utest/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index c090ed511..67dc489d1 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -60,8 +60,8 @@ ${DIR_EXT}/test_saxpby.c ${DIR_EXT}/test_daxpby.c ${DIR_EXT}/test_caxpby.c ${DIR_EXT}/test_zaxpby.c -# ${DIR_EXT}/test_caxpyc.c -# ${DIR_EXT}/test_zaxpyc.c +${DIR_EXT}/test_caxpyc.c +${DIR_EXT}/test_zaxpyc.c ${DIR_EXT}/test_cgemv_t.c ${DIR_EXT}/test_zgemv_t.c ${DIR_EXT}/test_cgemv_n.c From 4d8dee508ce415743d61042111a9d50660b2b8bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 01:05:03 +0100 Subject: [PATCH 630/718] temporarily disable the CAXPY/ZAXPY kernels --- kernel/riscv64/KERNEL.C910V | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 2798a870e..066329390 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy_vector.c -ZAXPYKERNEL = zaxpy_vector.c +CAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c From 68d354814f9f846338e1988c4f609c8add419012 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 01:14:22 +0100 Subject: [PATCH 631/718] Fix incompatible pointer type in BFLOAT16 mode --- interface/gemmt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index 01dec0c35..8fd8089d0 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -527,7 +527,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -616,7 +616,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -666,4 +666,4 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_END; return; -} \ No newline at end of file +} From 3597827c932b90be8916f5fbaa27dd8db818ee9e Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 1 Feb 2024 16:33:58 +0800 Subject: [PATCH 632/718] utest: add axpby --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_axpby.c | 320 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 utest/test_axpby.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 41829bd22..edfcfb7cf 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -17,6 +17,7 @@ else () test_swap.c test_zscal.c test_amin.c + test_axpby.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index 8acaa3ea9..867f8c88c 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -12,7 +12,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ - test_amin.o + test_amin.o test_axpby.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_axpby.c b/utest/test_axpby.c new file mode 100644 index 000000000..37ba8ad14 --- /dev/null +++ b/utest/test_axpby.c @@ -0,0 +1,320 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(axpby, saxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha = 1.0, beta = 2.0; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(axpby, daxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha = 1.0, beta = 2.0; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX +CTEST(axpby, caxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, caxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(caxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_COMPLEX16 +CTEST(axpby, zaxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, + 10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5, + 8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, zaxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 }; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(zaxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0, + 11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5, + 2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0, + 2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 }; + + for(i = 0; i < 4 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} +#endif From 1e1f487dc7e95c7e5a1d4234147439a5f4bca070 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 1 Feb 2024 19:57:05 +0800 Subject: [PATCH 633/718] LoongArch64: Fixed {s/d}axpby --- kernel/loongarch64/axpby_lasx.S | 9 +++++++-- kernel/loongarch64/axpby_lsx.S | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S index f1d99cd3b..7a246ca5c 100644 --- a/kernel/loongarch64/axpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXB, t2 xvreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S index 45154c262..e50d4cdcc 100644 --- a/kernel/loongarch64/axpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplgr2vr.w VXB, t2 vreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 From 7bc93d95a1d72fe733e43aa1bea64796b123dcbb Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 24 Jan 2024 16:11:45 +0800 Subject: [PATCH 634/718] LoongArch64: Opt {c/z}axpby --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 2 + kernel/loongarch64/KERNEL.LOONGSON3R5 | 2 + kernel/loongarch64/caxpby_lasx.S | 1046 ++++++++++++++++++++++ kernel/loongarch64/caxpby_lsx.S | 1029 +++++++++++++++++++++ 4 files changed, 2079 insertions(+) create mode 100644 kernel/loongarch64/caxpby_lasx.S create mode 100644 kernel/loongarch64/caxpby_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e27ce3bee..f4ab495e6 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = axpby_lsx.S DAXPBYKERNEL = axpby_lsx.S +CAXPBYKERNEL = caxpby_lsx.S +ZAXPBYKERNEL = caxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f4429cfba..bd85fab01 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = axpby_lasx.S DAXPBYKERNEL = axpby_lasx.S +CAXPBYKERNEL = caxpby_lasx.S +ZAXPBYKERNEL = caxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S new file mode 100644 index 000000000..c5802092e --- /dev/null +++ b/kernel/loongarch64/caxpby_lasx.S @@ -0,0 +1,1046 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXBR $xr14 +#define VXBI $xr13 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI + MTG t3, BETAR + MTG t4, BETAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + xvreplgr2vr.d VXBR, t3 + xvreplgr2vr.d VXBI, t4 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + xvreplgr2vr.w VXBR, t3 + xvreplgr2vr.w VXBI, t4 +#endif + xvxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + xvld VX1, X, 8 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + add.d Y, Y, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + xvld VX3, Y, 8 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvfmul.d x3, VXAI, x2 + xvfmul.d x4, VXAI, x1 + xvfmsub.d x3, VXAR, x1, x3 + xvfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d Y, Y, INCY + xvfmul.d x3, VXBI, x2 + xvfmul.d x4, VXBI, x1 + xvfmsub.d x3, VXBR, x1, x3 + xvfmadd.d x4, VXBR, x2, x4 + + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d Y, Y, INCY + + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MUL s3, BETAI, a4 + MUL s4, BETAI, a3 + MSUB s1, ALPHAR, a1, s1 + MADD s2, a2, ALPHAR, s2 + MSUB s3, BETAR, a3, s3 + MADD s4, a4, BETAR, s4 + ADD s3, s3, s1 + ADD s4, s4, s2 + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S new file mode 100644 index 000000000..247ae428e --- /dev/null +++ b/kernel/loongarch64/caxpby_lsx.S @@ -0,0 +1,1029 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXBR $vr14 +#define VXBI $vr13 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 +#ifdef DOUBLE + ffint.d.l a1, a1 +#else + ffint.s.l a1, a1 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, ALPHAR + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 + movfr2gr.d t3, BETAR + vreplgr2vr.d VXBR, t3 + movfr2gr.d t4, BETAI + vreplgr2vr.d VXBI, t4 +#else + movfr2gr.s t1, ALPHAR + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 + movfr2gr.s t3, BETAR + vreplgr2vr.w VXBR, t3 + movfr2gr.s t4, BETAI + vreplgr2vr.w VXBI, t4 +#endif + vxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#else + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#endif + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#endif + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, Y, 4 * SIZE + vld VX1, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#else + vld VX0, Y, 0 * SIZE + vld VX1, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#endif + +.L114: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#endif + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d Y, Y, INCY + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: +#ifdef DOUBLE + fld.d a1, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fld.d a3, Y, 0 * SIZE + fld.d a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.d s1, ALPHAI, a2 + fmul.d s2, ALPHAI, a1 + fmul.d s3, BETAI, a4 + fmul.d s4, BETAI, a3 + fmsub.d s1, ALPHAR, a1, s1 + fmadd.d s2, a2, ALPHAR, s2 + fmsub.d s3, BETAR, a3, s3 + fmadd.d s4, a4, BETAR, s4 + fadd.d s3, s3, s1 + fadd.d s4, s4, s2 + fst.d s3, Y, 0 * SIZE + fst.d s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#else + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + fld.s a3, Y, 0 * SIZE + fld.s a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.s s1, ALPHAI, a2 + fmul.s s2, ALPHAI, a1 + fmul.s s3, BETAI, a4 + fmul.s s4, BETAI, a3 + fmsub.s s1, ALPHAR, a1, s1 + fmadd.s s2, a2, ALPHAR, s2 + fmsub.s s3, BETAR, a3, s3 + fmadd.s s4, a4, BETAR, s4 + fadd.s s3, s3, s1 + fadd.s s4, s4, s2 + fst.s s3, Y, 0 * SIZE + fst.s s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#endif +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE From adde7253217a6824da1deff064b46e36c1e259ca Mon Sep 17 00:00:00 2001 From: gxw Date: Sun, 4 Feb 2024 14:43:08 +0800 Subject: [PATCH 635/718] LoongArch64: Fixed {s/d}amin LSX optimization --- kernel/loongarch64/amin_lsx.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S index c3c3f4ae9..47701b6e4 100644 --- a/kernel/loongarch64/amin_lsx.S +++ b/kernel/loongarch64/amin_lsx.S @@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13: FABS $f0, $f0 - SUB $f0, $f0, $f0 jirl $r0, $r1, 0x0 .align 3 From d02c61e82e82b7f721ecf50dd7717547b6302a19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 10:01:27 +0100 Subject: [PATCH 636/718] Update lowercase cpunames for RISC-V --- cpuid_riscv64.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c index 5db54f1aa..ff7ba2aad 100644 --- a/cpuid_riscv64.c +++ b/cpuid_riscv64.c @@ -86,7 +86,10 @@ static char *cpuname[] = { static char *cpuname_lower[] = { "riscv64_generic", - "c910v" + "c910v", + "x280", + "riscv64_zvl256b", + "riscv64_zvl128b" }; int detect(void){ From e61d96303d4972181e1d1b26b1f3a88ed9e5de02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 10:05:20 +0100 Subject: [PATCH 637/718] Fix missing NO_AVX2 fallback for SapphireRapids --- driver/others/dynamic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 69a473060..e3f905265 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; From 6d8a273cca96c16873a15b59ea724834aa9b4558 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Feb 2024 22:07:51 +0100 Subject: [PATCH 638/718] Handle zero increment(s) in C910V ?AXPBY (#4483) * Handle zero increment(s) --- kernel/riscv64/axpby_vector.c | 58 +++++++++++++++++++++++++ kernel/riscv64/zaxpby_vector.c | 77 ++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 850fc903e..721aad2b0 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -69,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_x, stride_y, ix = 0, iy = 0; + if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + + } else { /* vectorized approach for non-zero increments */ + if(beta == 0.0){ if(alpha == 0.0){//alpha == 0 && beta == 0 if(inc_y == 1){ @@ -381,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } } return(0); + } } diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index d5ad974cf..bbf2bbe7d 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); + if (inc_x == 0 || inc_y == 0) { + + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + + } else { + if(beta_r == 0.0 && beta_i == 0.0){ if(alpha_r == 0.0 && alpha_i == 0.0){ if(inc_y == 1){ @@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL } } return(0); + } } From 479e4af0893f3772eff9a75e89cf28bef91383c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Feb 2024 15:35:24 +0100 Subject: [PATCH 639/718] Rescale input vector more often to minimize relative error (Reference-LAPACK PR 981) --- lapack-netlib/SRC/clarfgp.f | 30 ++++++++++-------------------- lapack-netlib/SRC/zlarfgp.f | 30 ++++++++++-------------------- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index 47b5e47b0..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -148,33 +148,23 @@ ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 6c9efb04c..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -148,33 +148,23 @@ ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * From 63fbffddf86e2443e7aa4eda4fdad2db84b17568 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Feb 2024 21:44:03 +0100 Subject: [PATCH 640/718] Add option FIXED_LIBNAME to suppress versioning and softlinking --- Makefile | 25 +++++++++++++++---------- Makefile.system | 4 ++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index b344abcd2..e595ad99a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ TOPDIR = . include ./Makefile.system +LNCMD = ln -fs +ifeq ($(FIXED_LIBNAME), 1) +LNCMD = true +endif BLASDIRS = interface driver/level2 driver/level3 driver/others @@ -134,17 +138,17 @@ shared : libs netlib $(RELA) ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so + @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib + @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib + @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll @@ -229,13 +233,13 @@ ifeq ($(INTERFACE64),1) endif @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack prof_blas : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) + $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d prof || exit 1 ; \ @@ -246,7 +250,7 @@ ifeq ($(DYNAMIC_ARCH), 1) endif blas : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d libs || exit 1 ; \ @@ -254,7 +258,7 @@ blas : done hpl : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -268,7 +272,7 @@ ifeq ($(DYNAMIC_ARCH), 1) endif hpl_p : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) + $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) for d in $(SUBDIRS) ../laswp exports ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -401,6 +405,7 @@ lapack-runtest: lapack-test blas-test: (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) diff --git a/Makefile.system b/Makefile.system index 0088eaff5..0353a4d21 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1704,6 +1704,10 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) endif endif +ifeq ($(FIXED_LIBNAME),1) + LIBNAME = $(LIBPREFIX).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)_p.$(LIBSUFFIX) +endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a From 440edfd997a4a383ced5226553c15e50c1c750b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 5 Feb 2024 21:44:50 +0100 Subject: [PATCH 641/718] Add option to suppress versioning of the internal name --- exports/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 7682f851d..71c112c67 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -132,8 +132,12 @@ libgoto_hpl.def : $(GENSYM) ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ifeq ($(OSNAME), Darwin) +ifeq ($(FIXED_LIBNAME),1) +INTERNALNAME = $(LIBPREFIX).dylib +else INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib endif +endif ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def @@ -169,8 +173,12 @@ INTERNALNAME = $(LIBPREFIX).so FEXTRALIB += -lm EXTRALIB += -lm else +ifeq ($(FIXED_LIBNAME),1) +INTERNALNAME = $(LIBPREFIX).so +else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif +endif ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ../$(LIBSONAME) : ../$(LIBNAME) linktest.c From fe3da43b7dac7af1be5538f87f707a6073fbc52c Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 6 Feb 2024 11:49:01 +0800 Subject: [PATCH 642/718] Optimized zgemm kernel 8*4 LASX, 4*4 LSX and cgemm kernel 8*4 LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 20 +- kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- kernel/loongarch64/cgemm_kernel_8x4_lsx.S | 3313 ++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_4_lsx.S | 341 ++ kernel/loongarch64/cgemm_ncopy_8_lsx.S | 263 ++ kernel/loongarch64/cgemm_tcopy_4_lsx.S | 324 ++ kernel/loongarch64/cgemm_tcopy_8_lsx.S | 277 ++ kernel/loongarch64/zgemm_kernel_4x4_lsx.S | 2316 +++++++++++++ kernel/loongarch64/zgemm_kernel_8x4_lasx.S | 3545 ++++++++++++++++++++ kernel/loongarch64/zgemm_ncopy_4_lasx.S | 320 ++ kernel/loongarch64/zgemm_ncopy_4_lsx.S | 332 ++ kernel/loongarch64/zgemm_ncopy_8_lasx.S | 263 ++ kernel/loongarch64/zgemm_tcopy_4_lasx.S | 302 ++ kernel/loongarch64/zgemm_tcopy_4_lsx.S | 355 ++ kernel/loongarch64/zgemm_tcopy_8_lasx.S | 268 ++ param.h | 10 +- 16 files changed, 12248 insertions(+), 11 deletions(-) create mode 100644 kernel/loongarch64/cgemm_kernel_8x4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_8_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_8_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_4x4_lsx.S create mode 100644 kernel/loongarch64/zgemm_kernel_8x4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_ncopy_8_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lasx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lsx.S create mode 100644 kernel/loongarch64/zgemm_tcopy_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index f4ab495e6..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -111,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index bd85fab01..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -122,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S -ZGEMMONCOPY = zgemm_ncopy_2_lasx.S -ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S new file mode 100644 index 000000000..1e9fd8524 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S @@ -0,0 +1,3313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.w VALPHAR, $sp, 112 + vldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S new file mode 100644 index 000000000..6c4841b24 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.d VALPHAR, $sp, 112 + vldrepl.d VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index 003ea396f..5d2e960a2 100644 --- a/param.h +++ b/param.h @@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 From d4db6a9f16a5c82bbe1860f591cc731c4d83d7c8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 22:23:47 +0100 Subject: [PATCH 643/718] Separate the interface for SBGEMMT from GEMMT due to differences in GEMV arguments --- interface/CMakeLists.txt | 1 + interface/Makefile | 8 +- interface/gemmt.c | 3 +- interface/sbgemmt.c | 447 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 452 insertions(+), 7 deletions(-) create mode 100644 interface/sbgemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ed19b556a..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") diff --git a/interface/Makefile b/interface/Makefile index ad4a0fb89..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1303,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1662,10 +1662,6 @@ cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) -sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1971,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/gemmt.c b/interface/gemmt.c index 8fd8089d0..018deb7fb 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -158,7 +158,8 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; - + + nrowa = m; if (transa & 1) nrowa = k; nrowb = k; #if defined(COMPLEX) diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} From 1ed69ea1c05d091ec4f38358939aad441b0b2d4a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 23:35:12 +0100 Subject: [PATCH 644/718] improve naming --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 0353a4d21..49bd1cbed 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1705,8 +1705,8 @@ endif endif ifeq ($(FIXED_LIBNAME),1) - LIBNAME = $(LIBPREFIX).$(LIBSUFFIX) - LIBNAME_P = $(LIBPREFIX)_p.$(LIBSUFFIX) + LIBNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX) + LIBNAME_P = $(LIBPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX) endif LIBDLLNAME = $(LIBPREFIX).dll From 9ef10ffa496b919c25aedbb4aa2fdb930901475a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 23:38:19 +0100 Subject: [PATCH 645/718] Handle prefixed and suffixed libnames, optionally suppress softlinking --- Makefile.install | 24 ++++++++++++++---------- openblas.pc.in | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/Makefile.install b/Makefile.install index 81f959177..e7d129cce 100644 --- a/Makefile.install +++ b/Makefile.install @@ -2,10 +2,12 @@ TOPDIR = . export GOTOBLAS_MAKEFILE = 1 -include $(TOPDIR)/Makefile.conf_last include ./Makefile.system +LNCMD = ln -fs ifdef THELIBNAME LIBNAME=$(THELIBNAME) LIBSONAME=$(THELIBSONAME) +LNCMD = true endif ifeq ($(INTERFACE64),1) USE_64BITINT=1 @@ -99,7 +101,7 @@ ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifneq ($(NO_SHARED),1) @@ -107,21 +109,21 @@ ifneq ($(NO_SHARED),1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib + $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @@ -149,15 +151,15 @@ ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ - ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \ + $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif endif @@ -170,6 +172,8 @@ endif @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libprefix='$(LIBPREFIX) >> "$(PKGFILE)" + @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @@ -186,7 +190,7 @@ endif ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/openblas.pc.in b/openblas.pc.in index 8ad6e8bee..6c27c462b 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: -L${libdir} -l$(libprefix}openblas${libnamesuffix} Libs.private: ${extralib} Cflags: -I${includedir} From 25b300bbeebb03f781a94fe0a655f59cd20094a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Feb 2024 23:40:01 +0100 Subject: [PATCH 646/718] improve internal names --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 71c112c67..238623b04 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -133,7 +133,7 @@ libgoto_hpl.def : $(GENSYM) ifeq ($(OSNAME), Darwin) ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX).dylib +INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).dylib else INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib endif @@ -174,7 +174,7 @@ FEXTRALIB += -lm EXTRALIB += -lm else ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX).so +INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif From fb99fc2e6e4ec8ecdcfffe1ca1aeb787464d2825 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Feb 2024 13:42:08 +0100 Subject: [PATCH 647/718] fix type conversion warnings --- test/compare_sgemm_sbgemm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cf808b56d..4afa8bf93 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,6 +81,16 @@ float16to32 (bfloat16_bits f16) return f32.v; } +float +float32to16 (float32_bits f32) +{ + bfloat16_bits f16; + f16.bits.s = f32.bits.s; + f16.bits.e = f32.bits.e; + f16.bits.m = (uint32_t) f32.bits.m >> 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -108,16 +118,16 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + AA[j * k + i].v = float32to16( A[j * k + i] ); + BB[j * k + i].v = float32to16( B[j * k + i] ); CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) From 08ce6b1c1c6468c26607353d516e0dc8c009f58e Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 7 Feb 2024 07:54:06 -0600 Subject: [PATCH 648/718] Add missing CPU ID definitions for old versions of AIX. --- driver/others/dynamic_power.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0454f186c..16320dc40 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ +#endif + #ifdef _AIX #include From e9f480111e1d5b6f69c8053f79375b0a4242712f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Feb 2024 19:57:18 +0100 Subject: [PATCH 649/718] fix sbgemm bfloat16 conversion errors introduced in PR 4488 --- test/compare_sgemm_sbgemm.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 4afa8bf93..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,16 +81,6 @@ float16to32 (bfloat16_bits f16) return f32.v; } -float -float32to16 (float32_bits f32) -{ - bfloat16_bits f16; - f16.bits.s = f32.bits.s; - f16.bits.e = f32.bits.e; - f16.bits.m = (uint32_t) f32.bits.m >> 16; - return f32.v; -} - int main (int argc, char *argv[]) { @@ -110,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -118,8 +110,10 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = float32to16( A[j * k + i] ); - BB[j * k + i].v = float32to16( B[j * k + i] ); + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } From ff10e6b6dc30e247eb0cabd00de610b48c615c91 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 8 Feb 2024 00:19:54 +0300 Subject: [PATCH 650/718] Fix zero step tests --- utest/test_extensions/test_damin.c | 4 ++-- utest/test_extensions/test_dzamax.c | 7 +++---- utest/test_extensions/test_dzamin.c | 6 +++--- utest/test_extensions/test_idamin.c | 4 ++-- utest/test_extensions/test_isamin.c | 4 ++-- utest/test_extensions/test_samin.c | 6 +++--- utest/test_extensions/test_scamax.c | 7 +++---- utest/test_extensions/test_scamin.c | 6 +++--- 8 files changed, 21 insertions(+), 23 deletions(-) diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c index d492343ed..fdd2bc658 100644 --- a/utest/test_extensions/test_damin.c +++ b/utest/test_extensions/test_damin.c @@ -60,12 +60,12 @@ CTEST(damin, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0; double amin = BLASFUNC(damin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(x[0], amin, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c index edea3de8f..bdb3a4f18 100644 --- a/utest/test_extensions/test_dzamax.c +++ b/utest/test_extensions/test_dzamax.c @@ -59,13 +59,12 @@ CTEST(dzamax, bad_args_N_0){ CTEST(dzamax, step_zero){ blasint i; blasint N = ELEMENTS * 2, inc = 0; - double x[ELEMENTS]; - for (i = 0; i < N * inc * 2; i ++) { + double x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { x[i] = i + 1000; } - x[8] = 0.0; double amax = BLASFUNC(dzamax)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c index 916eede92..9fcf87b7b 100644 --- a/utest/test_extensions/test_dzamin.c +++ b/utest/test_extensions/test_dzamin.c @@ -59,13 +59,13 @@ CTEST(dzamin, bad_args_N_0){ CTEST(dzamin, step_zero){ blasint i; blasint N = ELEMENTS * 2, inc = 0; - double x[ELEMENTS]; - for (i = 0; i < N * inc * 2; i ++) { + double x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0; double amin = BLASFUNC(dzamin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index 9f099f666..6a7ed9d1e 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -62,7 +62,7 @@ CTEST(idamin, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0; @@ -435,7 +435,7 @@ CTEST(idamin, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0; diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index df8dead07..4ff235b83 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -62,7 +62,7 @@ CTEST(isamin, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0f; @@ -435,7 +435,7 @@ CTEST(isamin, c_api_step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0f; diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c index 5c747a0f6..2e3a73797 100644 --- a/utest/test_extensions/test_samin.c +++ b/utest/test_extensions/test_samin.c @@ -60,12 +60,12 @@ CTEST(samin, step_zero){ blasint i; blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; - for (i = 0; i < N * inc; i ++) { + for (i = 0; i < N; i ++) { x[i] = i + 1000; } - x[8] = 0.0f; + x[8] = 0.0; float amin = BLASFUNC(samin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(x[0], amin, SINGLE_EPS); } /** diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c index 39d7201ff..0f49ebfad 100644 --- a/utest/test_extensions/test_scamax.c +++ b/utest/test_extensions/test_scamax.c @@ -59,13 +59,12 @@ CTEST(scamax, bad_args_N_0){ CTEST(scamax, step_zero){ blasint i; blasint N = ELEMENTS * 2, inc = 0; - float x[ELEMENTS]; - for (i = 0; i < N * inc * 2; i ++) { + float x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { x[i] = i + 1000; } - x[8] = 0.0f; float amax = BLASFUNC(scamax)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, SINGLE_EPS); } /** diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c index 4baa23184..0f0414a1c 100644 --- a/utest/test_extensions/test_scamin.c +++ b/utest/test_extensions/test_scamin.c @@ -59,13 +59,13 @@ CTEST(scamin, bad_args_N_0){ CTEST(scamin, step_zero){ blasint i; blasint N = ELEMENTS * 2, inc = 0; - float x[ELEMENTS]; - for (i = 0; i < N * inc * 2; i ++) { + float x[ELEMENTS * 2]; + for (i = 0; i < N; i ++) { x[i] = i + 1000; } x[8] = 0.0f; float amin = BLASFUNC(scamin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, SINGLE_EPS); } /** From ec5cfe3bc8e7fb4fae09e961ea6169e01cd21fa3 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 8 Feb 2024 00:21:38 +0300 Subject: [PATCH 651/718] Fix invalid tests --- utest/test_extensions/test_cgeadd.c | 6 +++--- utest/test_extensions/test_dgeadd.c | 6 +++--- utest/test_extensions/test_dimatcopy.c | 8 ++++---- utest/test_extensions/test_sgeadd.c | 6 +++--- utest/test_extensions/test_simatcopy.c | 8 ++++---- utest/test_extensions/test_zgeadd.c | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c index 0cf6cbf87..9b87ad9f3 100644 --- a/utest/test_extensions/test_cgeadd.c +++ b/utest/test_extensions/test_cgeadd.c @@ -349,7 +349,7 @@ CTEST(cgeadd, xerbla_lda_invalid) blasint lda = INVALID; blasint ldc = 1; - int expected_info = 6; + int expected_info = 5; int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -682,7 +682,7 @@ CTEST(cgeadd, c_api_xerbla_n_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 1; + int expected_info = 2; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -730,7 +730,7 @@ CTEST(cgeadd, c_api_xerbla_m_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 2; + int expected_info = 1; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c index 4654c51a3..8f93a842e 100644 --- a/utest/test_extensions/test_dgeadd.c +++ b/utest/test_extensions/test_dgeadd.c @@ -346,7 +346,7 @@ CTEST(dgeadd, xerbla_lda_invalid) blasint lda = INVALID; blasint ldc = 1; - int expected_info = 6; + int expected_info = 5; int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -680,7 +680,7 @@ CTEST(dgeadd, c_api_xerbla_n_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 1; + int expected_info = 2; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -728,7 +728,7 @@ CTEST(dgeadd, c_api_xerbla_m_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 2; + int expected_info = 1; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index d2a16bbbf..811c356b3 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -856,7 +856,7 @@ CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'R'; char trans = 'N'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -873,7 +873,7 @@ CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'R'; char trans = 'T'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -890,7 +890,7 @@ CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'C'; char trans = 'N'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -907,7 +907,7 @@ CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'C'; char trans = 'T'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c index b42ce9c0e..171132b9d 100644 --- a/utest/test_extensions/test_sgeadd.c +++ b/utest/test_extensions/test_sgeadd.c @@ -349,7 +349,7 @@ CTEST(sgeadd, xerbla_lda_invalid) blasint lda = INVALID; blasint ldc = 1; - int expected_info = 6; + int expected_info = 5; int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -682,7 +682,7 @@ CTEST(sgeadd, c_api_xerbla_n_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 1; + int expected_info = 2; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -730,7 +730,7 @@ CTEST(sgeadd, c_api_xerbla_m_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 2; + int expected_info = 1; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index cf14d360c..ba388596d 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -856,7 +856,7 @@ CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'R'; char trans = 'N'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -873,7 +873,7 @@ CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'R'; char trans = 'T'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -890,7 +890,7 @@ CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'C'; char trans = 'N'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -907,7 +907,7 @@ CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb) blasint lda_src = 100, lda_dst = 50; char order = 'C'; char trans = 'T'; - int expected_info = 9; + int expected_info = 8; int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info); ASSERT_EQUAL(TRUE, passed); diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c index e50f86de0..7496ccf88 100644 --- a/utest/test_extensions/test_zgeadd.c +++ b/utest/test_extensions/test_zgeadd.c @@ -349,7 +349,7 @@ CTEST(zgeadd, xerbla_lda_invalid) blasint lda = INVALID; blasint ldc = 1; - int expected_info = 6; + int expected_info = 5; int passed = check_badargs('F', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -682,7 +682,7 @@ CTEST(zgeadd, c_api_xerbla_n_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 1; + int expected_info = 2; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); @@ -730,7 +730,7 @@ CTEST(zgeadd, c_api_xerbla_m_invalid_row_major) blasint lda = 1; blasint ldc = 1; - int expected_info = 2; + int expected_info = 1; int passed = check_badargs('C', order, m, n, lda, ldc, expected_info); ASSERT_EQUAL(TRUE, passed); From cfabc48190bb3ac1b5c6ace9ee560477394054c8 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Thu, 8 Feb 2024 00:22:15 +0300 Subject: [PATCH 652/718] Update rotg tests --- utest/test_extensions/test_crotg.c | 24 ++++++++++++------------ utest/test_extensions/test_zrotg.c | 16 ++++++++-------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c index 9db7dc7d3..84875ccf7 100644 --- a/utest/test_extensions/test_crotg.c +++ b/utest/test_extensions/test_crotg.c @@ -48,10 +48,10 @@ CTEST(crotg, zero_a) float sc; BLASFUNC(crotg)(sa, sb, &sc, ss); ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS); } /** @@ -83,8 +83,8 @@ CTEST(crotg, zero_real) float ss[2]; float sc; BLASFUNC(crotg)(sa, sb, &sc, ss); - ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); @@ -174,10 +174,10 @@ CTEST(crotg, c_api_zero_a) float sc; cblas_crotg(sa, sb, &sc, ss); ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS); } /** @@ -209,8 +209,8 @@ CTEST(crotg, c_api_zero_real) float ss[2]; float sc; cblas_crotg(sa, sb, &sc, ss); - ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS); - ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS); diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c index 310121422..1de95447d 100644 --- a/utest/test_extensions/test_zrotg.c +++ b/utest/test_extensions/test_zrotg.c @@ -48,10 +48,10 @@ CTEST(zrotg, zero_a) double sc; BLASFUNC(zrotg)(sa, sb, &sc, ss); ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS); } /** @@ -174,10 +174,10 @@ CTEST(zrotg, c_api_zero_a) double sc; cblas_zrotg(sa, sb, &sc, ss); ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS); - ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS); } /** From b3fa16345d83b723b8984b78dc6a2bb5d9f3d479 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:15:34 +0100 Subject: [PATCH 653/718] fix prototype for c/zaxpby --- common_interface.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_interface.h b/common_interface.h index 61a82c306..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -773,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); +void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); From 500ac4de5e20596d5cd797d745db97dd0a62ff86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 13:18:34 +0100 Subject: [PATCH 654/718] fix incompatible pointer types --- interface/zaxpby.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else From ac6b4b7aa472799245c497ab3caa6e77c0e80852 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 08:56:30 -0600 Subject: [PATCH 655/718] Make sure CPU ID works for all POWER_10 conditions --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 16320dc40..cd5e88922 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -69,7 +69,7 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; + else if (arch >= POWER_10) return CPU_POWER10; #endif return CPU_UNKNOWN; } From d408ecedba8cc1d9b799b89df6996bdcd996d5fc Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 8 Feb 2024 12:17:18 -0600 Subject: [PATCH 656/718] Add environment variable to display coretype for dynamic arch. --- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index cd5e88922..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -339,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { From 83bec5135534c94fe1790aa5795620a7cdeaa916 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 21:23:48 +0100 Subject: [PATCH 657/718] Update with recently added CBLAS interfaces and LAPACK/LAPACKE functions --- exports/gensymbol | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index 704eab06f..93edf2400 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,6 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -69,6 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -80,6 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax cblas_samin " cblasobjsz=" @@ -91,6 +94,7 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -861,6 +865,51 @@ lapackobjs2z="$lapackobjs2z zgedmd zgedmdq " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + claqz0 + claqz1 + claqz2 + claqz3 + clatrs3 + ctrsyl3 + " +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlaqz0 + dlaqz1 + dlaqz2 + dlaqz3 + dlaqz4 + dlarmm + dlatrs3 + dtrsyl3 + " +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlaqz0 + zlaqz1 + zlaqz2 + zlaqz3 + zlatrs3 + zrscl + ztrsyl3 +" + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1622,6 +1671,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -2239,6 +2296,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2848,6 +2913,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -3515,6 +3588,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3616,6 +3697,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3641,6 +3723,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3658,6 +3741,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3682,6 +3766,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib From 98c56a7314dbc0032152b7658c73c203124963f9 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Thu, 8 Feb 2024 13:50:15 -0800 Subject: [PATCH 658/718] more cleanup --- driver/others/blas_server_win32.c | 35 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index ee6d08f8c..89ce9e656 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -73,7 +73,7 @@ static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads // -// +// Legacy code path // static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { @@ -369,11 +369,11 @@ int blas_thread_init(void) { return 0; } -/* - User can call one of two routines. - exec_blas_async ... immediately returns after jobs are queued. - exec_blas ... returns after jobs are finished. -*/ +// +// User can call one of two routines. +// exec_blas_async ... immediately returns after jobs are queued. +// exec_blas ... returns after jobs are finished. +// int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) @@ -471,27 +471,32 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) { if ((num <= 0) || (queue == NULL)) return 0; - if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async(1, queue -> next); routine = queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } else + } else { if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, - queue -> sa, queue -> sb, 0); + queue -> sa, queue -> sb, 0); + } - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async_wait(num - 1, queue -> next); return 0; } +// // Shutdown procedure, but user don't have to call this routine. The // kernel automatically kill threads. +// int BLASFUNC(blas_thread_shutdown)(void) { int i; @@ -502,7 +507,7 @@ int BLASFUNC(blas_thread_shutdown)(void) { if (blas_server_avail) { - for(i = 0; i < blas_num_threads - 1; i++) { + for (i = 0; i < blas_num_threads - 1; i++) { // Could also just use WaitForMultipleObjects DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); @@ -524,6 +529,9 @@ int BLASFUNC(blas_thread_shutdown)(void) { return 0; } +// +// Legacy function to set numbef of threads +// void goto_set_num_threads(int num_threads) { long i; @@ -577,7 +585,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { + for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, @@ -593,6 +601,9 @@ void goto_set_num_threads(int num_threads) blas_cpu_number = num_threads; } +// +// Openblas function to set thread count +// void openblas_set_num_threads(int num) { goto_set_num_threads(num); From 93872f4681430a6ae9d857d8044d36ba98d09bf2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Feb 2024 23:02:09 +0100 Subject: [PATCH 659/718] drop the ?laqz? symbols for now (not translatable by f2c) --- exports/gensymbol | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 93edf2400..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -871,13 +871,14 @@ lapackobjs2z="$lapackobjs2z lapackobjs2c="$lapackobjs2c claqp2rk claqp3rk - claqz0 - claqz1 - claqz2 - claqz3 - clatrs3 ctrsyl3 " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + lapackobjs2d="$lapackobjs2d dgelqs dgelst @@ -885,15 +886,16 @@ lapackobjs2d="$lapackobjs2d dgeqrs dlaqp2rk dlaqp3rk - dlaqz0 - dlaqz1 - dlaqz2 - dlaqz3 - dlaqz4 dlarmm dlatrs3 dtrsyl3 " +# dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + lapackobjs2z="$lapackobjs2z zgelqs zgelst @@ -901,14 +903,14 @@ lapackobjs2z="$lapackobjs2z zgeqrs zlaqp2rk zlaqp3rk - zlaqz0 - zlaqz1 - zlaqz2 - zlaqz3 zlatrs3 zrscl ztrsyl3 -" + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx From ff1523163f71e1ad9f1e4a2c8548416f213e7cb1 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 9 Feb 2024 12:59:14 +0000 Subject: [PATCH 660/718] Fix axpy test hangs when n==0. Reenable zaxpy_vector kernel for C910V. --- kernel/riscv64/KERNEL.C910V | 4 ++-- kernel/riscv64/axpy_vector.c | 2 +- kernel/riscv64/zaxpy_vector.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 066329390..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index e99ca8542..6dffe5f09 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index d19e51118..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -45,10 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); From bb96e466aee3a969bfb767d347262977bc918658 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Feb 2024 15:50:11 +0100 Subject: [PATCH 661/718] Introduce LIBNAMEPREFIX to avoid messing with the internal LIBPREFIX --- Makefile.install | 4 ++-- Makefile.rule | 5 +++++ Makefile.system | 12 ++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Makefile.install b/Makefile.install index e7d129cce..2b00db4be 100644 --- a/Makefile.install +++ b/Makefile.install @@ -172,7 +172,7 @@ endif @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" - @echo 'libprefix='$(LIBPREFIX) >> "$(PKGFILE)" + @echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)" @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @@ -190,7 +190,7 @@ endif ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/Makefile.rule b/Makefile.rule index 8dbf5eab6..02ec739fd 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -5,6 +5,11 @@ # This library's version VERSION = 0.3.26.dev +# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a +# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library +# +# LIBNAMEPREFIX = scipy + # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # is libopenblas_$(LIBNAMESUFFIX).so.0. diff --git a/Makefile.system b/Makefile.system index 49bd1cbed..233663ef9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1516,6 +1516,10 @@ ifndef LIBSONAMEBASE LIBSONAMEBASE = openblas endif +ifndef LIBNAMEPREFIX +LIBNAMEPREFIX = +endif + ifndef LIBNAMESUFFIX LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) else @@ -1523,9 +1527,9 @@ LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) -LIBPREFIX = cyg$(LIBNAMEBASE) +LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE) else -LIBPREFIX = lib$(LIBNAMEBASE) +LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE) endif KERNELDIR = $(TOPDIR)/kernel/$(ARCH) @@ -1705,8 +1709,8 @@ endif endif ifeq ($(FIXED_LIBNAME),1) - LIBNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX) - LIBNAME_P = $(LIBPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX) + LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX) + LIBNAME_P = lib$(LIBNAMEPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX) endif LIBDLLNAME = $(LIBPREFIX).dll From 10ea3fb7421b4beeddbaff9e759cdcf873c4b369 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Feb 2024 17:09:55 +0100 Subject: [PATCH 662/718] fix duplication of name parts --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 238623b04..e8dea0364 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -133,7 +133,7 @@ libgoto_hpl.def : $(GENSYM) ifeq ($(OSNAME), Darwin) ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).dylib +INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib else INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib endif @@ -174,7 +174,7 @@ FEXTRALIB += -lm EXTRALIB += -lm else ifeq ($(FIXED_LIBNAME),1) -INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so +INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so else INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) endif From 4c554bd527cc3b8ed0c160cee457e23bfe442343 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Sat, 10 Feb 2024 00:46:52 +0300 Subject: [PATCH 663/718] check abs zero inc --- utest/test_extensions/test_damin.c | 4 ++-- utest/test_extensions/test_dzamax.c | 4 ++-- utest/test_extensions/test_dzamin.c | 4 ++-- utest/test_extensions/test_samin.c | 4 ++-- utest/test_extensions/test_scamax.c | 4 ++-- utest/test_extensions/test_scamin.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c index fdd2bc658..736921fa3 100644 --- a/utest/test_extensions/test_damin.c +++ b/utest/test_extensions/test_damin.c @@ -61,11 +61,11 @@ CTEST(damin, step_zero){ blasint N = ELEMENTS, inc = 0; double x[ELEMENTS]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } x[8] = 0.0; double amin = BLASFUNC(damin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(x[0], amin, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(fabs(x[0]), amin, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c index bdb3a4f18..7bc0200c9 100644 --- a/utest/test_extensions/test_dzamax.c +++ b/utest/test_extensions/test_dzamax.c @@ -61,10 +61,10 @@ CTEST(dzamax, step_zero){ blasint N = ELEMENTS * 2, inc = 0; double x[ELEMENTS * 2]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } double amax = BLASFUNC(dzamax)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amax, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c index 9fcf87b7b..549881fdc 100644 --- a/utest/test_extensions/test_dzamin.c +++ b/utest/test_extensions/test_dzamin.c @@ -61,11 +61,11 @@ CTEST(dzamin, step_zero){ blasint N = ELEMENTS * 2, inc = 0; double x[ELEMENTS * 2]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } x[8] = 0.0; double amin = BLASFUNC(dzamin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amin, DOUBLE_EPS); } /** diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c index 2e3a73797..fd34d462a 100644 --- a/utest/test_extensions/test_samin.c +++ b/utest/test_extensions/test_samin.c @@ -61,11 +61,11 @@ CTEST(samin, step_zero){ blasint N = ELEMENTS, inc = 0; float x[ELEMENTS]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } x[8] = 0.0; float amin = BLASFUNC(samin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL(x[0], amin, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(fabsf(x[0]), amin, SINGLE_EPS); } /** diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c index 0f49ebfad..8c214ddff 100644 --- a/utest/test_extensions/test_scamax.c +++ b/utest/test_extensions/test_scamax.c @@ -61,10 +61,10 @@ CTEST(scamax, step_zero){ blasint N = ELEMENTS * 2, inc = 0; float x[ELEMENTS * 2]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } float amax = BLASFUNC(scamax)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amax, SINGLE_EPS); } /** diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c index 0f0414a1c..507548f2a 100644 --- a/utest/test_extensions/test_scamin.c +++ b/utest/test_extensions/test_scamin.c @@ -61,11 +61,11 @@ CTEST(scamin, step_zero){ blasint N = ELEMENTS * 2, inc = 0; float x[ELEMENTS * 2]; for (i = 0; i < N; i ++) { - x[i] = i + 1000; + x[i] = i - 1000; } x[8] = 0.0f; float amin = BLASFUNC(scamin)(&N, x, &inc); - ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, SINGLE_EPS); + ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amin, SINGLE_EPS); } /** From 5e9ead09ac03f27f7906576d4ec16cb80dc9fb4d Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Sat, 10 Feb 2024 00:47:25 +0300 Subject: [PATCH 664/718] fix info return --- interface/geadd.c | 4 ++-- interface/zgeadd.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/interface/geadd.c b/interface/geadd.c index 3a0ea015d..a2e6d1081 100644 --- a/interface/geadd.c +++ b/interface/geadd.c @@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order, if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; - if (n < 0) info = 2; - if (m < 0) info = 1; + if (n < 0) info = 1; + if (m < 0) info = 2; } if (info >= 0) { diff --git a/interface/zgeadd.c b/interface/zgeadd.c index 7124cf230..de71f27b8 100644 --- a/interface/zgeadd.c +++ b/interface/zgeadd.c @@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; @@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order, if (ldc < MAX(1, m)) info = 8; if (lda < MAX(1, m)) info = 5; - if (n < 0) info = 2; - if (m < 0) info = 1; + if (n < 0) info = 1; + if (m < 0) info = 2; } if (info >= 0) { From c6f30fd4146258dcdfc7cf1322ad1e0b4f88e1ac Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Sat, 10 Feb 2024 00:48:07 +0300 Subject: [PATCH 665/718] check for zero inc --- interface/max.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/max.c b/interface/max.c index 6c7d32bd9..7817601b9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -46,6 +46,12 @@ #ifdef USE_ABS +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + #ifndef USE_MIN /* ABS & MAX */ @@ -92,6 +98,8 @@ #else +#define ABS + #ifndef USE_MIN /* MAX */ @@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0; +#ifndef COMPLEX + if (incx == 0) return (ABS(*x)); +#else + if (incx == 0) return (ABS(*x) + ABS(*(x+1))); +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -158,6 +172,12 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0; +#ifndef COMPLEX + if (incx == 0) return (ABS(*x)); +#else + if (incx == 0) return (ABS(*x) + ABS(*(x+1))); +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); From 7e9b1c08074d1e88f446fc5861f44c8844e93d30 Mon Sep 17 00:00:00 2001 From: kseniyazaytseva Date: Sat, 10 Feb 2024 00:49:42 +0300 Subject: [PATCH 666/718] fix uninitialized data usage --- interface/zrotg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index ea73352dd..8acc3c9b2 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { if (ada >= h *safmin) { *C = sqrt(ada/h); *R = *DA / *C; - *(R+1) = *(DA+1) / *(C+1); + *(R+1) = *(DA+1) / *C; rtmax *= 2.; if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); @@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { *C = ada / adahsq; if (*C >= safmin) { *R = *DA / *C; - *(R+1) = *(DA+1) / *(C+1); + *(R+1) = *(DA+1) / *C; } else { *R = *DA * (h / adahsq); *(R+1) = *(DA+1) * (h / adahsq); From 8698f9e37f63bd0a57b8182aaecb46cac2adf620 Mon Sep 17 00:00:00 2001 From: Dmitry Mikushin Date: Sat, 10 Feb 2024 19:12:16 +0100 Subject: [PATCH 667/718] Adding basic support of benchmarks into CMake for single, double, single complex and double complex cases. Each benchmarking target has a suffix to identify the data type, for example ./benchmark_gemm3m_COMPLEX_DOUBLE is a gemm3m.c source compiled with COMPLEX and DOUBLE macros defined --- CMakeLists.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a1e4b271..059035ac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,6 +452,34 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() endif() +if (BUILD_TESTING) + find_package(OpenMP REQUIRED) + file(GLOB SOURCES "benchmark/*.c") + foreach(source ${SOURCES}) + get_filename_component(name ${source} NAME_WE) + if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) + set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE") + foreach(define ${defines}) + set(target_name "benchmark_${name}") + if (NOT "${define}" STREQUAL "DEFAULT") + string(JOIN "_" define_str ${define}) + set(target_name "${target_name}_${define_str}") + endif() + if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND + (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) + add_executable(${target_name} ${source}) + target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + if (NOT "${define}" STREQUAL "DEFAULT") + target_compile_definitions(${target_name} PRIVATE ${define}) + endif() + endif() + endforeach() + endif() + endforeach() +endif() # Install project From d0f5dc763be1783874c5fe5c84c0eef90e73a6d1 Mon Sep 17 00:00:00 2001 From: Dmitry Mikushin Date: Mon, 12 Feb 2024 02:18:03 +0100 Subject: [PATCH 668/718] Adding USE_GEMM3M macro to kernel targets, so that the *gemm3m functions and parameters can be included into the gotoblas structure. Fixes #4500 --- kernel/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 60314eedb..74e6760c2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1349,6 +1349,9 @@ endif () set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") + if (USE_GEMM3M) + target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) + endif() endfunction () From 10548a0460d0b6abd160e69cd7ca727d41681584 Mon Sep 17 00:00:00 2001 From: Mark Seminatore Date: Mon, 12 Feb 2024 10:22:12 -0800 Subject: [PATCH 669/718] update contributors --- CONTRIBUTORS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 493747052..8f7abc5f8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -218,4 +218,6 @@ In chronological order: * [2022-08] Fix building from sources for QNX * Mark Seminatore - * [2023-11-09] Improve Windows threading performance scaling \ No newline at end of file + * [2023-11-09] Improve Windows threading performance scaling + * [2024-02-09] Introduce MT_TRACE facility and improve code consistency + \ No newline at end of file From a28afac791853e93c8462199d7ce18694ccc0a2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 11:48:33 +0100 Subject: [PATCH 670/718] Add FIXED_LIBNAME, LIBNAMEPREFIX and LIBNAMESUFFIX --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a1e4b271..a4440ee1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON) +option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF) + +set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" ) +set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" ) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() @@ -96,7 +101,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") -set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) +set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) set(BLASDIRS interface driver/level2 driver/level3 driver/others) @@ -336,11 +341,12 @@ endif() add_subdirectory(cpp_thread_test) endif() +if (NOT FIXED_LIBNAME) set_target_properties(${OpenBLAS_LIBS} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) - +endif() if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") From ffbfc3c692c44029eda64a5bc2cc1292bac5572e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 12:16:34 +0100 Subject: [PATCH 671/718] Add libname prefix and suffix --- cmake/openblas.pc.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 11e5606e5..9526d2df6 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -1,4 +1,6 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +libnameprefix=@LIBNAMEPREFIX@ +libnamesuffix=@LIBNAMESUFFIX@ libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ @@ -7,5 +9,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OpenBLAS_VERSION@ URL: https://github.com/OpenMathLib/OpenBLAS -Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} Cflags: -I${includedir} From a0e3f77e0bdbaab44ac86076e15ad15c65c15106 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 12:17:38 +0100 Subject: [PATCH 672/718] add FIXED_LIBNAME, PREFIX and SUFFIX --- cmake/system.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index bc87f7b44..95f34e9f8 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH endif() endif() endif() + +set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas") + if (DEFINED LIBNAMESUFFIX) - set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") -else () - set(LIBPREFIX "libopenblas") + set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}") endif () if (NOT DEFINED SYMBOLPREFIX) @@ -679,6 +680,10 @@ else () endif () endif () +if (DEFINED FIXED_LIBNAME) + set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}") + set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}") +endif() set(LIBDLLNAME "${LIBPREFIX}.dll") set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") From 3120f12e767f05cf59e53e903c6a2a2ca9440b07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 14:16:20 +0100 Subject: [PATCH 673/718] allow for more pre- and suffixes in the name of the openblas library --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e7236677a..d9c34fe98 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -106,7 +106,7 @@ set(ZDMDEIGTST zchkdmd.f90) macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) + target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From c90979d8ef1aa69f7993efabb1b652d96d0f286c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 14:17:11 +0100 Subject: [PATCH 674/718] allow for more pre- and suffixes in the name of the openblas library --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 143fd0597..95baa3122 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) + target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From ca6b4961e4a5d2804372769f7d04cf5d7d86968e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Feb 2024 14:31:11 +0100 Subject: [PATCH 675/718] updates to fix option conflicts and config file generation --- Makefile.install | 4 +++- Makefile.system | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 2b00db4be..b2bc5aa41 100644 --- a/Makefile.install +++ b/Makefile.install @@ -7,6 +7,8 @@ LNCMD = ln -fs ifdef THELIBNAME LIBNAME=$(THELIBNAME) LIBSONAME=$(THELIBSONAME) +endif +ifeq ($(FIXED_LIBNAME), 1) LNCMD = true endif ifeq ($(INTERFACE64),1) @@ -190,7 +192,7 @@ endif ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/Makefile.system b/Makefile.system index 233663ef9..2be7024c8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1523,7 +1523,7 @@ endif ifndef LIBNAMESUFFIX LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) From ba17758c02134acb327a4b71202be6be15e36dbd Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 16 Feb 2024 15:58:02 +0000 Subject: [PATCH 676/718] fix axpy implementations where y has a stride of 0 --- kernel/riscv64/axpy_rvv.c | 26 ++++++++++++++++++++++++-- kernel/riscv64/axpy_vector.c | 24 +++++++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c index 8bc2f30de..2d5293f76 100644 --- a/kernel/riscv64/axpy_rvv.c +++ b/kernel/riscv64/axpy_rvv.c @@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define VSETVL(n) __riscv_vsetvl_e32m8(n) #define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_M1_T vfloat32m1_t #define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 #else #define VSETVL(n) __riscv_vsetvl_e64m8(n) #define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_M1_T vfloat64m1_t #define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSEV_FLOAT(y, vy, vl); } - } else if (1 == inc_x) { + } else if (1 == inc_x && 0 != inc_y) { BLASLONG stride_y = inc_y * sizeof(FLOAT); @@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSSEV_FLOAT(y, stride_y, vy, vl); } - } else { + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy = VFMVVF_FLOAT( y[0], in_vl ); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + } + FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 ); + vres = VFREDSUMVS_FLOAT( vy, vres, in_vl ); + VSEV_FLOAT_M1(y, vres, 1); + } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT); diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 6dffe5f09..c77a18afa 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _) #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSEV_FLOAT(&y[j], vy0, gvl); j += gvl; } - }else if(inc_x == 1){ + } else if (1 == inc_x && 0 != inc_y) { stride_y = inc_y * sizeof(FLOAT); gvl = VSETVL(n); if(gvl <= n/2){ @@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); j += gvl; } + } else if( 0 == inc_y ) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + size_t in_vl = VSETVL(n); + vy0 = VFMVVF_FLOAT( y[0], in_vl ); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + vx0 = VLSEV_FLOAT(x, stride_x, vl); + vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl); + } + FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 ); + v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl ); + y[0] = EXTRACT_FLOAT(v_res); }else{ stride_x = inc_x * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT); From 461ecabb2249fd598b325a91d2b4dfccbc90a824 Mon Sep 17 00:00:00 2001 From: Sergei Lewis Date: Fri, 16 Feb 2024 11:33:28 +0000 Subject: [PATCH 677/718] add RISCV64_ZVL128B and RISCV64_ZVL256B targets to CI flows and to README.md --- .github/workflows/riscv64_vector.yml | 253 +++++++++++++++++++++++++++ README.md | 10 ++ common_riscv64.h | 2 +- 3 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/riscv64_vector.yml diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml new file mode 100644 index 000000000..dd6fe9ca8 --- /dev/null +++ b/.github/workflows/riscv64_vector.yml @@ -0,0 +1,253 @@ +name: riscv64 zvl256b qemu test + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + env: + triple: riscv64-unknown-linux-gnu + riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain + riscv_gnu_toolchain_version: 13.2.0 + riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz + strategy: + fail-fast: false + matrix: + include: + - target: RISCV64_ZVL128B + opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64 + - target: RISCV64_ZVL256B + opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 + qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make \ + libgomp1-riscv64-cross ccache + wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path} + tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS libs + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) + + - name: build OpenBLAS tests + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests + + - name: build lapack-netlib tests + working-directory: ./lapack-netlib/TESTING + run: | + export PATH="/opt/riscv/bin:$PATH" + make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \ + CC='${triple}-gcc' \ + AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \ + RANLIB='ccache ${triple}-ranlib' \ + FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \ + HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \ + LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \ + LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \ + LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \ + + - name: OpenBLAS tests + shell: bash + run: | + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \ + echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \ + if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ + else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \ + RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \ + } + run_test test cblat1 & + run_test test cblat2 cblat2.dat & + run_test test cblat3 cblat3.dat & + run_test test dblat1 & + run_test test dblat2 dblat2.dat & + run_test test dblat3 dblat3.dat & + run_test test sblat1 & + run_test test sblat2 sblat2.dat & + run_test test sblat3 sblat3.dat & + run_test test zblat1 & + run_test test zblat2 zblat2.dat & + run_test test zblat3 zblat3.dat & + run_test ctest xccblat1 & + run_test ctest xccblat2 cin2 & + run_test ctest xccblat3 cin3 & + run_test ctest xdcblat1 & + run_test ctest xdcblat2 din2 & + run_test ctest xdcblat3 din3 & + run_test ctest xscblat1 & + run_test ctest xscblat2 sin2 & + run_test ctest xscblat3 sin3 & + run_test ctest xzcblat1 & + run_test ctest xzcblat2 zin2 & + run_test ctest xzcblat3 zin3 & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi + + - name: netlib tests + shell: bash + run: | + : # these take a very long time + echo "Skipping netlib tests in CI" + exit 0 + : # comment out exit above to enable the tests + : # probably we want to identify a subset to run in CI + export PATH="/opt/riscv/bin:$PATH" + export QEMU_CPU=${{ matrix.qemu_cpu }} + rm -rf ./test_out + mkdir -p ./test_out + run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \ + echo "$4" >> $OUTPUT; \ + echo "$CMD" >> $OUTPUT; \ + qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \ + RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \ + if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \ + if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \ + } + run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" & + run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" & + run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" & + run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" & + run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" & + run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" & + run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" & + run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" & + run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" & + run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" & + run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" & + run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" & + run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" & + run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" & + run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" & + run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" & + run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" & + run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" & + run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" & + run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" & + run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" & + run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" & + run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" & + run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" & + run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" & + run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" & + run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" & + run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" & + run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" & + run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" & + run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" & + run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" & + run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" & + run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" & + run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" & + run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" & + run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" & + run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" & + run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" & + run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" & + run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" & + run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" & + run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" & + run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" & + run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" & + run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" & + run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" & + run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" & + run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" & + run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" & + run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" & + run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" & + run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" & + run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" & + run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" & + run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" & + run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" & + run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" & + run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" & + run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" & + run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" & + run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" & + run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" & + run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" & + run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" & + run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" & + run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" & + run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" & + run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" & + wait + while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*) + python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary + TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)" + NUMERICAL_ERRORS=-1 + OTHER_ERRORS=-1 + . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary + if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi + if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi diff --git a/README.md b/README.md index 2f0a0da4c..43f390db0 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran ``` +- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available. +e.g.: + ```sh +make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \ + BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \ + AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \ + LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \ + HOSTCC=gcc HOSTFC=gfortran -j + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. diff --git a/common_riscv64.h b/common_riscv64.h index ab3bfa25a..eccfc644f 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define BUFFER_SIZE ( 32 << 20) #define SEEK_ADDRESS -#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) +#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280) # include #endif From 4787a55c64a17d80020c370c3e439f362979c83e Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 20 Feb 2024 20:41:45 +0800 Subject: [PATCH 678/718] Optimized cgemm kernel 16x4 LASX for LoongArch --- kernel/generic/zhemm_ltcopy_16.c | 1170 ++++++ kernel/generic/zhemm_utcopy_16.c | 1168 ++++++ kernel/generic/zneg_tcopy_16.c | 587 +++ kernel/generic/zsymm_lcopy_16.c | 333 ++ kernel/generic/zsymm_ucopy_16.c | 332 ++ kernel/generic/ztrmm_lncopy_16.c | 2310 ++++++++++++ kernel/generic/ztrmm_ltcopy_16.c | 2313 ++++++++++++ kernel/generic/ztrmm_uncopy_16.c | 2316 ++++++++++++ kernel/generic/ztrmm_utcopy_16.c | 2318 ++++++++++++ kernel/generic/ztrsm_lncopy_16.c | 308 ++ kernel/generic/ztrsm_ltcopy_16.c | 264 ++ kernel/generic/ztrsm_uncopy_16.c | 313 ++ kernel/generic/ztrsm_utcopy_16.c | 261 ++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 10 +- kernel/loongarch64/cgemm_kernel_16x4_lasx.S | 3757 +++++++++++++++++++ kernel/loongarch64/cgemm_ncopy_16_lasx.S | 691 ++++ kernel/loongarch64/cgemm_ncopy_4_lasx.S | 325 ++ kernel/loongarch64/cgemm_tcopy_16_lasx.S | 741 ++++ kernel/loongarch64/cgemm_tcopy_4_lasx.S | 306 ++ param.h | 12 +- 20 files changed, 19828 insertions(+), 7 deletions(-) create mode 100644 kernel/generic/zhemm_ltcopy_16.c create mode 100644 kernel/generic/zhemm_utcopy_16.c create mode 100644 kernel/generic/zneg_tcopy_16.c create mode 100644 kernel/generic/zsymm_lcopy_16.c create mode 100644 kernel/generic/zsymm_ucopy_16.c create mode 100644 kernel/generic/ztrmm_lncopy_16.c create mode 100644 kernel/generic/ztrmm_ltcopy_16.c create mode 100644 kernel/generic/ztrmm_uncopy_16.c create mode 100644 kernel/generic/ztrmm_utcopy_16.c create mode 100644 kernel/generic/ztrsm_lncopy_16.c create mode 100644 kernel/generic/ztrsm_ltcopy_16.c create mode 100644 kernel/generic/ztrsm_uncopy_16.c create mode 100644 kernel/generic/ztrsm_utcopy_16.c create mode 100644 kernel/loongarch64/cgemm_kernel_16x4_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_16_lasx.S create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_16_lasx.S create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lasx.S diff --git a/kernel/generic/zhemm_ltcopy_16.c b/kernel/generic/zhemm_ltcopy_16.c new file mode 100644 index 000000000..8797891ea --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_16.c @@ -0,0 +1,1170 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c new file mode 100644 index 000000000..822483a83 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_16.c @@ -0,0 +1,1168 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = -data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c new file mode 100644 index 000000000..50f5a3d37 --- /dev/null +++ b/kernel/generic/zneg_tcopy_16.c @@ -0,0 +1,587 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 32; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = *(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + ctemp33 = *(aoffset2 + 0); + ctemp34 = *(aoffset2 + 1); + ctemp35 = *(aoffset2 + 2); + ctemp36 = *(aoffset2 + 3); + ctemp37 = *(aoffset2 + 4); + ctemp38 = *(aoffset2 + 5); + ctemp39 = *(aoffset2 + 6); + ctemp40 = *(aoffset2 + 7); + ctemp41 = *(aoffset2 + 8); + ctemp42 = *(aoffset2 + 9); + ctemp43 = *(aoffset2 + 10); + ctemp44 = *(aoffset2 + 11); + ctemp45 = *(aoffset2 + 12); + ctemp46 = *(aoffset2 + 13); + ctemp47 = *(aoffset2 + 14); + ctemp48 = *(aoffset2 + 15); + ctemp49 = *(aoffset2 + 16); + ctemp50 = *(aoffset2 + 17); + ctemp51 = *(aoffset2 + 18); + ctemp52 = *(aoffset2 + 19); + ctemp53 = *(aoffset2 + 20); + ctemp54 = *(aoffset2 + 21); + ctemp55 = *(aoffset2 + 22); + ctemp56 = *(aoffset2 + 23); + ctemp57 = *(aoffset2 + 24); + ctemp58 = *(aoffset2 + 25); + ctemp59 = *(aoffset2 + 26); + ctemp60 = *(aoffset2 + 27); + ctemp61 = *(aoffset2 + 28); + ctemp62 = *(aoffset2 + 29); + ctemp63 = *(aoffset2 + 30); + ctemp64 = *(aoffset2 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + *(boffset + 32) = -ctemp33; + *(boffset + 33) = -ctemp34; + *(boffset + 34) = -ctemp35; + *(boffset + 35) = -ctemp36; + *(boffset + 36) = -ctemp37; + *(boffset + 37) = -ctemp38; + *(boffset + 38) = -ctemp39; + *(boffset + 39) = -ctemp40; + + *(boffset + 40) = -ctemp41; + *(boffset + 41) = -ctemp42; + *(boffset + 42) = -ctemp43; + *(boffset + 43) = -ctemp44; + *(boffset + 44) = -ctemp45; + *(boffset + 45) = -ctemp46; + *(boffset + 46) = -ctemp47; + *(boffset + 47) = -ctemp48; + + *(boffset + 48) = -ctemp49; + *(boffset + 49) = -ctemp50; + *(boffset + 50) = -ctemp51; + *(boffset + 51) = -ctemp52; + *(boffset + 52) = -ctemp53; + *(boffset + 53) = -ctemp54; + *(boffset + 54) = -ctemp55; + *(boffset + 55) = -ctemp56; + + *(boffset + 56) = -ctemp57; + *(boffset + 57) = -ctemp58; + *(boffset + 58) = -ctemp59; + *(boffset + 59) = -ctemp60; + *(boffset + 60) = -ctemp61; + *(boffset + 61) = -ctemp62; + *(boffset + 62) = -ctemp63; + *(boffset + 63) = -ctemp64; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 64; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = *(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + boffset += 32; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + // aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + // boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c new file mode 100644 index 000000000..b32374a5e --- /dev/null +++ b/kernel/generic/zsymm_lcopy_16.c @@ -0,0 +1,333 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c new file mode 100644 index 000000000..cb19bea47 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_16.c @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c new file mode 100644 index 000000000..d7fb23176 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_16.c @@ -0,0 +1,2310 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 2); + b[ 33] = *(a01 + 3); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a01 + 5); + b[ 66] = *(a02 + 4); + b[ 67] = *(a02 + 5); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a01 + 7); + b[ 98] = *(a02 + 6); + b[ 99] = *(a02 + 7); + b[100] = *(a03 + 6); + b[101] = *(a03 + 7); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a01 + 9); + b[130] = *(a02 + 8); + b[131] = *(a02 + 9); + b[132] = *(a03 + 8); + b[133] = *(a03 + 9); + b[134] = *(a04 + 8); + b[135] = *(a04 + 9); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a01 + 11); + b[162] = *(a02 + 10); + b[163] = *(a02 + 11); + b[164] = *(a03 + 10); + b[165] = *(a03 + 11); + b[166] = *(a04 + 10); + b[167] = *(a04 + 11); + b[168] = *(a05 + 10); + b[169] = *(a05 + 11); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a01 + 13); + b[194] = *(a02 + 12); + b[195] = *(a02 + 13); + b[196] = *(a03 + 12); + b[197] = *(a03 + 13); + b[198] = *(a04 + 12); + b[199] = *(a04 + 13); + b[200] = *(a05 + 12); + b[201] = *(a05 + 13); + b[202] = *(a06 + 12); + b[203] = *(a06 + 13); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a01 + 15); + b[226] = *(a02 + 14); + b[227] = *(a02 + 15); + b[228] = *(a03 + 14); + b[229] = *(a03 + 15); + b[230] = *(a04 + 14); + b[231] = *(a04 + 15); + b[232] = *(a05 + 14); + b[233] = *(a05 + 15); + b[234] = *(a06 + 14); + b[235] = *(a06 + 15); + b[236] = *(a07 + 14); + b[237] = *(a07 + 15); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a01 + 16); + b[257] = *(a01 + 17); + b[258] = *(a02 + 16); + b[259] = *(a02 + 17); + b[260] = *(a03 + 16); + b[261] = *(a03 + 17); + b[262] = *(a04 + 16); + b[263] = *(a04 + 17); + b[264] = *(a05 + 16); + b[265] = *(a05 + 17); + b[266] = *(a06 + 16); + b[267] = *(a06 + 17); + b[268] = *(a07 + 16); + b[269] = *(a07 + 17); + b[270] = *(a08 + 16); + b[271] = *(a08 + 17); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a01 + 18); + b[289] = *(a01 + 19); + b[290] = *(a02 + 18); + b[291] = *(a02 + 19); + b[292] = *(a03 + 18); + b[293] = *(a03 + 19); + b[294] = *(a04 + 18); + b[295] = *(a04 + 19); + b[296] = *(a05 + 18); + b[297] = *(a05 + 19); + b[298] = *(a06 + 18); + b[299] = *(a06 + 19); + b[300] = *(a07 + 18); + b[301] = *(a07 + 19); + b[302] = *(a08 + 18); + b[303] = *(a08 + 19); + b[304] = *(a09 + 18); + b[305] = *(a09 + 19); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a01 + 20); + b[321] = *(a01 + 21); + b[322] = *(a02 + 20); + b[323] = *(a02 + 21); + b[324] = *(a03 + 20); + b[325] = *(a03 + 21); + b[326] = *(a04 + 20); + b[327] = *(a04 + 21); + b[328] = *(a05 + 20); + b[329] = *(a05 + 21); + b[330] = *(a06 + 20); + b[331] = *(a06 + 21); + b[332] = *(a07 + 20); + b[333] = *(a07 + 21); + b[334] = *(a08 + 20); + b[335] = *(a08 + 21); + b[336] = *(a09 + 20); + b[337] = *(a09 + 21); + b[338] = *(a10 + 20); + b[339] = *(a10 + 21); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a01 + 22); + b[353] = *(a01 + 23); + b[354] = *(a02 + 22); + b[355] = *(a02 + 23); + b[356] = *(a03 + 22); + b[357] = *(a03 + 23); + b[358] = *(a04 + 22); + b[359] = *(a04 + 23); + b[360] = *(a05 + 22); + b[361] = *(a05 + 23); + b[362] = *(a06 + 22); + b[363] = *(a06 + 23); + b[364] = *(a07 + 22); + b[365] = *(a07 + 23); + b[366] = *(a08 + 22); + b[367] = *(a08 + 23); + b[368] = *(a09 + 22); + b[369] = *(a09 + 23); + b[370] = *(a10 + 22); + b[371] = *(a10 + 23); + b[372] = *(a11 + 22); + b[373] = *(a11 + 23); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a01 + 24); + b[385] = *(a01 + 25); + b[386] = *(a02 + 24); + b[387] = *(a02 + 25); + b[388] = *(a03 + 24); + b[389] = *(a03 + 25); + b[390] = *(a04 + 24); + b[391] = *(a04 + 25); + b[392] = *(a05 + 24); + b[393] = *(a05 + 25); + b[394] = *(a06 + 24); + b[395] = *(a06 + 25); + b[396] = *(a07 + 24); + b[397] = *(a07 + 25); + b[398] = *(a08 + 24); + b[399] = *(a08 + 25); + b[400] = *(a09 + 24); + b[401] = *(a09 + 25); + b[402] = *(a10 + 24); + b[403] = *(a10 + 25); + b[404] = *(a11 + 24); + b[405] = *(a11 + 25); + b[406] = *(a12 + 24); + b[407] = *(a12 + 25); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a01 + 26); + b[417] = *(a01 + 27); + b[418] = *(a02 + 26); + b[419] = *(a02 + 27); + b[420] = *(a03 + 26); + b[421] = *(a03 + 27); + b[422] = *(a04 + 26); + b[423] = *(a04 + 27); + b[424] = *(a05 + 26); + b[425] = *(a05 + 27); + b[426] = *(a06 + 26); + b[427] = *(a06 + 27); + b[428] = *(a07 + 26); + b[429] = *(a07 + 27); + b[430] = *(a08 + 26); + b[431] = *(a08 + 27); + b[432] = *(a09 + 26); + b[433] = *(a09 + 27); + b[434] = *(a10 + 26); + b[435] = *(a10 + 27); + b[436] = *(a11 + 26); + b[437] = *(a11 + 27); + b[438] = *(a12 + 26); + b[439] = *(a12 + 27); + b[440] = *(a13 + 26); + b[441] = *(a13 + 27); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a01 + 28); + b[449] = *(a01 + 29); + b[450] = *(a02 + 28); + b[451] = *(a02 + 29); + b[452] = *(a03 + 28); + b[453] = *(a03 + 29); + b[454] = *(a04 + 28); + b[455] = *(a04 + 29); + b[456] = *(a05 + 28); + b[457] = *(a05 + 29); + b[458] = *(a06 + 28); + b[459] = *(a06 + 29); + b[460] = *(a07 + 28); + b[461] = *(a07 + 29); + b[462] = *(a08 + 28); + b[463] = *(a08 + 29); + b[464] = *(a09 + 28); + b[465] = *(a09 + 29); + b[466] = *(a10 + 28); + b[467] = *(a10 + 29); + b[468] = *(a11 + 28); + b[469] = *(a11 + 29); + b[470] = *(a12 + 28); + b[471] = *(a12 + 29); + b[472] = *(a13 + 28); + b[473] = *(a13 + 29); + b[474] = *(a14 + 28); + b[475] = *(a14 + 29); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a01 + 30); + b[481] = *(a01 + 31); + b[482] = *(a02 + 30); + b[483] = *(a02 + 31); + b[484] = *(a03 + 30); + b[485] = *(a03 + 31); + b[486] = *(a04 + 30); + b[487] = *(a04 + 31); + b[488] = *(a05 + 30); + b[489] = *(a05 + 31); + b[490] = *(a06 + 30); + b[491] = *(a06 + 31); + b[492] = *(a07 + 30); + b[493] = *(a07 + 31); + b[494] = *(a08 + 30); + b[495] = *(a08 + 31); + b[496] = *(a09 + 30); + b[497] = *(a09 + 31); + b[498] = *(a10 + 30); + b[499] = *(a10 + 31); + b[500] = *(a11 + 30); + b[501] = *(a11 + 31); + b[502] = *(a12 + 30); + b[503] = *(a12 + 31); + b[504] = *(a13 + 30); + b[505] = *(a13 + 31); + b[506] = *(a14 + 30); + b[507] = *(a14 + 31); + b[508] = *(a15 + 30); + b[509] = *(a15 + 31); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a01 + 15); + b[ 2] = *(a02 + 14); + b[ 3] = *(a02 + 15); + b[ 4] = *(a03 + 14); + b[ 5] = *(a03 + 15); + b[ 6] = *(a04 + 14); + b[ 7] = *(a04 + 15); + b[ 8] = *(a05 + 14); + b[ 9] = *(a05 + 15); + b[ 10] = *(a06 + 14); + b[ 11] = *(a06 + 15); + b[ 12] = *(a07 + 14); + b[ 13] = *(a07 + 15); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a01 + 16); + b[ 1] = *(a01 + 17); + b[ 2] = *(a02 + 16); + b[ 3] = *(a02 + 17); + b[ 4] = *(a03 + 16); + b[ 5] = *(a03 + 17); + b[ 6] = *(a04 + 16); + b[ 7] = *(a04 + 17); + b[ 8] = *(a05 + 16); + b[ 9] = *(a05 + 17); + b[ 10] = *(a06 + 16); + b[ 11] = *(a06 + 17); + b[ 12] = *(a07 + 16); + b[ 13] = *(a07 + 17); + b[ 14] = *(a08 + 16); + b[ 15] = *(a08 + 17); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a01 + 18); + b[ 1] = *(a01 + 19); + b[ 2] = *(a02 + 18); + b[ 3] = *(a02 + 19); + b[ 4] = *(a03 + 18); + b[ 5] = *(a03 + 19); + b[ 6] = *(a04 + 18); + b[ 7] = *(a04 + 19); + b[ 8] = *(a05 + 18); + b[ 9] = *(a05 + 19); + b[ 10] = *(a06 + 18); + b[ 11] = *(a06 + 19); + b[ 12] = *(a07 + 18); + b[ 13] = *(a07 + 19); + b[ 14] = *(a08 + 18); + b[ 15] = *(a08 + 19); + b[ 16] = *(a09 + 18); + b[ 17] = *(a09 + 19); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a01 + 20); + b[ 1] = *(a01 + 21); + b[ 2] = *(a02 + 20); + b[ 3] = *(a02 + 21); + b[ 4] = *(a03 + 20); + b[ 5] = *(a03 + 21); + b[ 6] = *(a04 + 20); + b[ 7] = *(a04 + 21); + b[ 8] = *(a05 + 20); + b[ 9] = *(a05 + 21); + b[ 10] = *(a06 + 20); + b[ 11] = *(a06 + 21); + b[ 12] = *(a07 + 20); + b[ 13] = *(a07 + 21); + b[ 14] = *(a08 + 20); + b[ 15] = *(a08 + 21); + b[ 16] = *(a09 + 20); + b[ 17] = *(a09 + 21); + b[ 18] = *(a10 + 20); + b[ 19] = *(a10 + 21); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a01 + 22); + b[ 1] = *(a01 + 23); + b[ 2] = *(a02 + 22); + b[ 3] = *(a02 + 23); + b[ 4] = *(a03 + 22); + b[ 5] = *(a03 + 23); + b[ 6] = *(a04 + 22); + b[ 7] = *(a04 + 23); + b[ 8] = *(a05 + 22); + b[ 9] = *(a05 + 23); + b[ 10] = *(a06 + 22); + b[ 11] = *(a06 + 23); + b[ 12] = *(a07 + 22); + b[ 13] = *(a07 + 23); + b[ 14] = *(a08 + 22); + b[ 15] = *(a08 + 23); + b[ 16] = *(a09 + 22); + b[ 17] = *(a09 + 23); + b[ 18] = *(a10 + 22); + b[ 19] = *(a10 + 23); + b[ 20] = *(a11 + 22); + b[ 21] = *(a11 + 23); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a01 + 24); + b[ 1] = *(a01 + 25); + b[ 2] = *(a02 + 24); + b[ 3] = *(a02 + 25); + b[ 4] = *(a03 + 24); + b[ 5] = *(a03 + 25); + b[ 6] = *(a04 + 24); + b[ 7] = *(a04 + 25); + b[ 8] = *(a05 + 24); + b[ 9] = *(a05 + 25); + b[ 10] = *(a06 + 24); + b[ 11] = *(a06 + 25); + b[ 12] = *(a07 + 24); + b[ 13] = *(a07 + 25); + b[ 14] = *(a08 + 24); + b[ 15] = *(a08 + 25); + b[ 16] = *(a09 + 24); + b[ 17] = *(a09 + 25); + b[ 18] = *(a10 + 24); + b[ 19] = *(a10 + 25); + b[ 20] = *(a11 + 24); + b[ 21] = *(a11 + 25); + b[ 22] = *(a12 + 24); + b[ 23] = *(a12 + 25); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a01 + 26); + b[ 1] = *(a01 + 27); + b[ 2] = *(a02 + 26); + b[ 3] = *(a02 + 27); + b[ 4] = *(a03 + 26); + b[ 5] = *(a03 + 27); + b[ 6] = *(a04 + 26); + b[ 7] = *(a04 + 27); + b[ 8] = *(a05 + 26); + b[ 9] = *(a05 + 27); + b[ 10] = *(a06 + 26); + b[ 11] = *(a06 + 27); + b[ 12] = *(a07 + 26); + b[ 13] = *(a07 + 27); + b[ 14] = *(a08 + 26); + b[ 15] = *(a08 + 27); + b[ 16] = *(a09 + 26); + b[ 17] = *(a09 + 27); + b[ 18] = *(a10 + 26); + b[ 19] = *(a10 + 27); + b[ 20] = *(a11 + 26); + b[ 21] = *(a11 + 27); + b[ 22] = *(a12 + 26); + b[ 23] = *(a12 + 27); + b[ 24] = *(a13 + 26); + b[ 25] = *(a13 + 27); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a01 + 28); + b[ 1] = *(a01 + 29); + b[ 2] = *(a02 + 28); + b[ 3] = *(a02 + 29); + b[ 4] = *(a03 + 28); + b[ 5] = *(a03 + 29); + b[ 6] = *(a04 + 28); + b[ 7] = *(a04 + 29); + b[ 8] = *(a05 + 28); + b[ 9] = *(a05 + 29); + b[ 10] = *(a06 + 28); + b[ 11] = *(a06 + 29); + b[ 12] = *(a07 + 28); + b[ 13] = *(a07 + 29); + b[ 14] = *(a08 + 28); + b[ 15] = *(a08 + 29); + b[ 16] = *(a09 + 28); + b[ 17] = *(a09 + 29); + b[ 18] = *(a10 + 28); + b[ 19] = *(a10 + 29); + b[ 20] = *(a11 + 28); + b[ 21] = *(a11 + 29); + b[ 22] = *(a12 + 28); + b[ 23] = *(a12 + 29); + b[ 24] = *(a13 + 28); + b[ 25] = *(a13 + 29); + b[ 26] = *(a14 + 28); + b[ 27] = *(a14 + 29); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a01 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a01 + 5); + b[ 34] = *(a02 + 4); + b[ 35] = *(a02 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a01 + 7); + b[ 50] = *(a02 + 6); + b[ 51] = *(a02 + 7); + b[ 52] = *(a03 + 6); + b[ 53] = *(a03 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 8); + b[ 65] = *(a01 + 9); + b[ 66] = *(a02 + 8); + b[ 67] = *(a02 + 9); + b[ 68] = *(a03 + 8); + b[ 69] = *(a03 + 9); + b[ 70] = *(a04 + 8); + b[ 71] = *(a04 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 10); + b[ 81] = *(a01 + 11); + b[ 82] = *(a02 + 10); + b[ 83] = *(a02 + 11); + b[ 84] = *(a03 + 10); + b[ 85] = *(a03 + 11); + b[ 86] = *(a04 + 10); + b[ 87] = *(a04 + 11); + b[ 88] = *(a05 + 10); + b[ 89] = *(a05 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 12); + b[ 97] = *(a01 + 13); + b[ 98] = *(a02 + 12); + b[ 99] = *(a02 + 13); + b[100] = *(a03 + 12); + b[101] = *(a03 + 13); + b[102] = *(a04 + 12); + b[103] = *(a04 + 13); + b[104] = *(a05 + 12); + b[105] = *(a05 + 13); + b[106] = *(a06 + 12); + b[107] = *(a06 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 14); + b[113] = *(a01 + 15); + b[114] = *(a02 + 14); + b[115] = *(a02 + 15); + b[116] = *(a03 + 14); + b[117] = *(a03 + 15); + b[118] = *(a04 + 14); + b[119] = *(a04 + 15); + b[120] = *(a05 + 14); + b[121] = *(a05 + 15); + b[122] = *(a06 + 14); + b[123] = *(a06 + 15); + b[124] = *(a07 + 14); + b[125] = *(a07 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a01 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 4); + b[ 17] = *(a01 + 5); + b[ 18] = *(a02 + 4); + b[ 19] = *(a02 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 6); + b[ 25] = *(a01 + 7); + b[ 26] = *(a02 + 6); + b[ 27] = *(a02 + 7); + b[ 28] = *(a03 + 6); + b[ 29] = *(a03 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X < posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c new file mode 100644 index 000000000..8d585e70b --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_16.c @@ -0,0 +1,2313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } else + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a02 + 4); + b[ 37] = *(a02 + 5); + b[ 38] = *(a02 + 6); + b[ 39] = *(a02 + 7); + b[ 40] = *(a02 + 8); + b[ 41] = *(a02 + 9); + b[ 42] = *(a02 + 10); + b[ 43] = *(a02 + 11); + b[ 44] = *(a02 + 12); + b[ 45] = *(a02 + 13); + b[ 46] = *(a02 + 14); + b[ 47] = *(a02 + 15); + b[ 48] = *(a02 + 16); + b[ 49] = *(a02 + 17); + b[ 50] = *(a02 + 18); + b[ 51] = *(a02 + 19); + b[ 52] = *(a02 + 20); + b[ 53] = *(a02 + 21); + b[ 54] = *(a02 + 22); + b[ 55] = *(a02 + 23); + b[ 56] = *(a02 + 24); + b[ 57] = *(a02 + 25); + b[ 58] = *(a02 + 26); + b[ 59] = *(a02 + 27); + b[ 60] = *(a02 + 28); + b[ 61] = *(a02 + 29); + b[ 62] = *(a02 + 30); + b[ 63] = *(a02 + 31); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a03 + 6); + b[ 71] = *(a03 + 7); + b[ 72] = *(a03 + 8); + b[ 73] = *(a03 + 9); + b[ 74] = *(a03 + 10); + b[ 75] = *(a03 + 11); + b[ 76] = *(a03 + 12); + b[ 77] = *(a03 + 13); + b[ 78] = *(a03 + 14); + b[ 79] = *(a03 + 15); + b[ 80] = *(a03 + 16); + b[ 81] = *(a03 + 17); + b[ 82] = *(a03 + 18); + b[ 83] = *(a03 + 19); + b[ 84] = *(a03 + 20); + b[ 85] = *(a03 + 21); + b[ 86] = *(a03 + 22); + b[ 87] = *(a03 + 23); + b[ 88] = *(a03 + 24); + b[ 89] = *(a03 + 25); + b[ 90] = *(a03 + 26); + b[ 91] = *(a03 + 27); + b[ 92] = *(a03 + 28); + b[ 93] = *(a03 + 29); + b[ 94] = *(a03 + 30); + b[ 95] = *(a03 + 31); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a04 + 8); + b[105] = *(a04 + 9); + b[106] = *(a04 + 10); + b[107] = *(a04 + 11); + b[108] = *(a04 + 12); + b[109] = *(a04 + 13); + b[110] = *(a04 + 14); + b[111] = *(a04 + 15); + b[112] = *(a04 + 16); + b[113] = *(a04 + 17); + b[114] = *(a04 + 18); + b[115] = *(a04 + 19); + b[116] = *(a04 + 20); + b[117] = *(a04 + 21); + b[118] = *(a04 + 22); + b[119] = *(a04 + 23); + b[120] = *(a04 + 24); + b[121] = *(a04 + 25); + b[122] = *(a04 + 26); + b[123] = *(a04 + 27); + b[124] = *(a04 + 28); + b[125] = *(a04 + 29); + b[126] = *(a04 + 30); + b[127] = *(a04 + 31); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a05 + 10); + b[139] = *(a05 + 11); + b[140] = *(a05 + 12); + b[141] = *(a05 + 13); + b[142] = *(a05 + 14); + b[143] = *(a05 + 15); + b[144] = *(a05 + 16); + b[145] = *(a05 + 17); + b[146] = *(a05 + 18); + b[147] = *(a05 + 19); + b[148] = *(a05 + 20); + b[149] = *(a05 + 21); + b[150] = *(a05 + 22); + b[151] = *(a05 + 23); + b[152] = *(a05 + 24); + b[153] = *(a05 + 25); + b[154] = *(a05 + 26); + b[155] = *(a05 + 27); + b[156] = *(a05 + 28); + b[157] = *(a05 + 29); + b[158] = *(a05 + 30); + b[159] = *(a05 + 31); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a06 + 12); + b[173] = *(a06 + 13); + b[174] = *(a06 + 14); + b[175] = *(a06 + 15); + b[176] = *(a06 + 16); + b[177] = *(a06 + 17); + b[178] = *(a06 + 18); + b[179] = *(a06 + 19); + b[180] = *(a06 + 20); + b[181] = *(a06 + 21); + b[182] = *(a06 + 22); + b[183] = *(a06 + 23); + b[184] = *(a06 + 24); + b[185] = *(a06 + 25); + b[186] = *(a06 + 26); + b[187] = *(a06 + 27); + b[188] = *(a06 + 28); + b[189] = *(a06 + 29); + b[190] = *(a06 + 30); + b[191] = *(a06 + 31); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a07 + 14); + b[207] = *(a07 + 15); + b[208] = *(a07 + 16); + b[209] = *(a07 + 17); + b[210] = *(a07 + 18); + b[211] = *(a07 + 19); + b[212] = *(a07 + 20); + b[213] = *(a07 + 21); + b[214] = *(a07 + 22); + b[215] = *(a07 + 23); + b[216] = *(a07 + 24); + b[217] = *(a07 + 25); + b[218] = *(a07 + 26); + b[219] = *(a07 + 27); + b[220] = *(a07 + 28); + b[221] = *(a07 + 29); + b[222] = *(a07 + 30); + b[223] = *(a07 + 31); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a08 + 16); + b[241] = *(a08 + 17); + b[242] = *(a08 + 18); + b[243] = *(a08 + 19); + b[244] = *(a08 + 20); + b[245] = *(a08 + 21); + b[246] = *(a08 + 22); + b[247] = *(a08 + 23); + b[248] = *(a08 + 24); + b[249] = *(a08 + 25); + b[250] = *(a08 + 26); + b[251] = *(a08 + 27); + b[252] = *(a08 + 28); + b[253] = *(a08 + 29); + b[254] = *(a08 + 30); + b[255] = *(a08 + 31); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a09 + 18); + b[275] = *(a09 + 19); + b[276] = *(a09 + 20); + b[277] = *(a09 + 21); + b[278] = *(a09 + 22); + b[279] = *(a09 + 23); + b[280] = *(a09 + 24); + b[281] = *(a09 + 25); + b[282] = *(a09 + 26); + b[283] = *(a09 + 27); + b[284] = *(a09 + 28); + b[285] = *(a09 + 29); + b[286] = *(a09 + 30); + b[287] = *(a09 + 31); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a10 + 20); + b[309] = *(a10 + 21); + b[310] = *(a10 + 22); + b[311] = *(a10 + 23); + b[312] = *(a10 + 24); + b[313] = *(a10 + 25); + b[314] = *(a10 + 26); + b[315] = *(a10 + 27); + b[316] = *(a10 + 28); + b[317] = *(a10 + 29); + b[318] = *(a10 + 30); + b[319] = *(a10 + 31); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a11 + 22); + b[343] = *(a11 + 23); + b[344] = *(a11 + 24); + b[345] = *(a11 + 25); + b[346] = *(a11 + 26); + b[347] = *(a11 + 27); + b[348] = *(a11 + 28); + b[349] = *(a11 + 29); + b[350] = *(a11 + 30); + b[351] = *(a11 + 31); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a12 + 24); + b[377] = *(a12 + 25); + b[378] = *(a12 + 26); + b[379] = *(a12 + 27); + b[380] = *(a12 + 28); + b[381] = *(a12 + 29); + b[382] = *(a12 + 30); + b[383] = *(a12 + 31); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a13 + 26); + b[411] = *(a13 + 27); + b[412] = *(a13 + 28); + b[413] = *(a13 + 29); + b[414] = *(a13 + 30); + b[415] = *(a13 + 31); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a14 + 28); + b[445] = *(a14 + 29); + b[446] = *(a14 + 30); + b[447] = *(a14 + 31); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a15 + 30); + b[479] = *(a15 + 31); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[ 10] = *(a02 + 10); + b[ 11] = *(a02 + 11); + b[ 12] = *(a02 + 12); + b[ 13] = *(a02 + 13); + b[ 14] = *(a02 + 14); + b[ 15] = *(a02 + 15); + b[ 16] = *(a02 + 16); + b[ 17] = *(a02 + 17); + b[ 18] = *(a02 + 18); + b[ 19] = *(a02 + 19); + b[ 20] = *(a02 + 20); + b[ 21] = *(a02 + 21); + b[ 22] = *(a02 + 22); + b[ 23] = *(a02 + 23); + b[ 24] = *(a02 + 24); + b[ 25] = *(a02 + 25); + b[ 26] = *(a02 + 26); + b[ 27] = *(a02 + 27); + b[ 28] = *(a02 + 28); + b[ 29] = *(a02 + 29); + b[ 30] = *(a02 + 30); + b[ 31] = *(a02 + 31); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[ 10] = *(a03 + 10); + b[ 11] = *(a03 + 11); + b[ 12] = *(a03 + 12); + b[ 13] = *(a03 + 13); + b[ 14] = *(a03 + 14); + b[ 15] = *(a03 + 15); + b[ 16] = *(a03 + 16); + b[ 17] = *(a03 + 17); + b[ 18] = *(a03 + 18); + b[ 19] = *(a03 + 19); + b[ 20] = *(a03 + 20); + b[ 21] = *(a03 + 21); + b[ 22] = *(a03 + 22); + b[ 23] = *(a03 + 23); + b[ 24] = *(a03 + 24); + b[ 25] = *(a03 + 25); + b[ 26] = *(a03 + 26); + b[ 27] = *(a03 + 27); + b[ 28] = *(a03 + 28); + b[ 29] = *(a03 + 29); + b[ 30] = *(a03 + 30); + b[ 31] = *(a03 + 31); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[ 10] = *(a04 + 10); + b[ 11] = *(a04 + 11); + b[ 12] = *(a04 + 12); + b[ 13] = *(a04 + 13); + b[ 14] = *(a04 + 14); + b[ 15] = *(a04 + 15); + b[ 16] = *(a04 + 16); + b[ 17] = *(a04 + 17); + b[ 18] = *(a04 + 18); + b[ 19] = *(a04 + 19); + b[ 20] = *(a04 + 20); + b[ 21] = *(a04 + 21); + b[ 22] = *(a04 + 22); + b[ 23] = *(a04 + 23); + b[ 24] = *(a04 + 24); + b[ 25] = *(a04 + 25); + b[ 26] = *(a04 + 26); + b[ 27] = *(a04 + 27); + b[ 28] = *(a04 + 28); + b[ 29] = *(a04 + 29); + b[ 30] = *(a04 + 30); + b[ 31] = *(a04 + 31); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a05 + 10); + b[ 11] = *(a05 + 11); + b[ 12] = *(a05 + 12); + b[ 13] = *(a05 + 13); + b[ 14] = *(a05 + 14); + b[ 15] = *(a05 + 15); + b[ 16] = *(a05 + 16); + b[ 17] = *(a05 + 17); + b[ 18] = *(a05 + 18); + b[ 19] = *(a05 + 19); + b[ 20] = *(a05 + 20); + b[ 21] = *(a05 + 21); + b[ 22] = *(a05 + 22); + b[ 23] = *(a05 + 23); + b[ 24] = *(a05 + 24); + b[ 25] = *(a05 + 25); + b[ 26] = *(a05 + 26); + b[ 27] = *(a05 + 27); + b[ 28] = *(a05 + 28); + b[ 29] = *(a05 + 29); + b[ 30] = *(a05 + 30); + b[ 31] = *(a05 + 31); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a06 + 12); + b[ 13] = *(a06 + 13); + b[ 14] = *(a06 + 14); + b[ 15] = *(a06 + 15); + b[ 16] = *(a06 + 16); + b[ 17] = *(a06 + 17); + b[ 18] = *(a06 + 18); + b[ 19] = *(a06 + 19); + b[ 20] = *(a06 + 20); + b[ 21] = *(a06 + 21); + b[ 22] = *(a06 + 22); + b[ 23] = *(a06 + 23); + b[ 24] = *(a06 + 24); + b[ 25] = *(a06 + 25); + b[ 26] = *(a06 + 26); + b[ 27] = *(a06 + 27); + b[ 28] = *(a06 + 28); + b[ 29] = *(a06 + 29); + b[ 30] = *(a06 + 30); + b[ 31] = *(a06 + 31); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a07 + 14); + b[ 15] = *(a07 + 15); + b[ 16] = *(a07 + 16); + b[ 17] = *(a07 + 17); + b[ 18] = *(a07 + 18); + b[ 19] = *(a07 + 19); + b[ 20] = *(a07 + 20); + b[ 21] = *(a07 + 21); + b[ 22] = *(a07 + 22); + b[ 23] = *(a07 + 23); + b[ 24] = *(a07 + 24); + b[ 25] = *(a07 + 25); + b[ 26] = *(a07 + 26); + b[ 27] = *(a07 + 27); + b[ 28] = *(a07 + 28); + b[ 29] = *(a07 + 29); + b[ 30] = *(a07 + 30); + b[ 31] = *(a07 + 31); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a08 + 16); + b[ 17] = *(a08 + 17); + b[ 18] = *(a08 + 18); + b[ 19] = *(a08 + 19); + b[ 20] = *(a08 + 20); + b[ 21] = *(a08 + 21); + b[ 22] = *(a08 + 22); + b[ 23] = *(a08 + 23); + b[ 24] = *(a08 + 24); + b[ 25] = *(a08 + 25); + b[ 26] = *(a08 + 26); + b[ 27] = *(a08 + 27); + b[ 28] = *(a08 + 28); + b[ 29] = *(a08 + 29); + b[ 30] = *(a08 + 30); + b[ 31] = *(a08 + 31); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a09 + 18); + b[ 19] = *(a09 + 19); + b[ 20] = *(a09 + 20); + b[ 21] = *(a09 + 21); + b[ 22] = *(a09 + 22); + b[ 23] = *(a09 + 23); + b[ 24] = *(a09 + 24); + b[ 25] = *(a09 + 25); + b[ 26] = *(a09 + 26); + b[ 27] = *(a09 + 27); + b[ 28] = *(a09 + 28); + b[ 29] = *(a09 + 29); + b[ 30] = *(a09 + 30); + b[ 31] = *(a09 + 31); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a10 + 20); + b[ 21] = *(a10 + 21); + b[ 22] = *(a10 + 22); + b[ 23] = *(a10 + 23); + b[ 24] = *(a10 + 24); + b[ 25] = *(a10 + 25); + b[ 26] = *(a10 + 26); + b[ 27] = *(a10 + 27); + b[ 28] = *(a10 + 28); + b[ 29] = *(a10 + 29); + b[ 30] = *(a10 + 30); + b[ 31] = *(a10 + 31); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a11 + 22); + b[ 23] = *(a11 + 23); + b[ 24] = *(a11 + 24); + b[ 25] = *(a11 + 25); + b[ 26] = *(a11 + 26); + b[ 27] = *(a11 + 27); + b[ 28] = *(a11 + 28); + b[ 29] = *(a11 + 29); + b[ 30] = *(a11 + 30); + b[ 31] = *(a11 + 31); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a12 + 24); + b[ 25] = *(a12 + 25); + b[ 26] = *(a12 + 26); + b[ 27] = *(a12 + 27); + b[ 28] = *(a12 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a12 + 30); + b[ 31] = *(a12 + 31); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a13 + 26); + b[ 27] = *(a13 + 27); + b[ 28] = *(a13 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a13 + 30); + b[ 31] = *(a13 + 31); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a14 + 28); + b[ 29] = *(a14 + 29); + b[ 30] = *(a14 + 30); + b[ 31] = *(a14 + 31); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a15 + 30); + b[ 31] = *(a15 + 31); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + /* a01 += lda; + a02 += lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + // posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c new file mode 100644 index 000000000..40b85db38 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_16.c @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a03 + 2); + b[ 37] = *(a03 + 3); + b[ 38] = *(a04 + 2); + b[ 39] = *(a04 + 3); + b[ 40] = *(a05 + 2); + b[ 41] = *(a05 + 3); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a07 + 2); + b[ 45] = *(a07 + 3); + b[ 46] = *(a08 + 2); + b[ 47] = *(a08 + 3); + b[ 48] = *(a09 + 2); + b[ 49] = *(a09 + 3); + b[ 50] = *(a10 + 2); + b[ 51] = *(a10 + 3); + b[ 52] = *(a11 + 2); + b[ 53] = *(a11 + 3); + b[ 54] = *(a12 + 2); + b[ 55] = *(a12 + 3); + b[ 56] = *(a13 + 2); + b[ 57] = *(a13 + 3); + b[ 58] = *(a14 + 2); + b[ 59] = *(a14 + 3); + b[ 60] = *(a15 + 2); + b[ 61] = *(a15 + 3); + b[ 62] = *(a16 + 2); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a04 + 4); + b[ 71] = *(a04 + 5); + b[ 72] = *(a05 + 4); + b[ 73] = *(a05 + 5); + b[ 74] = *(a06 + 4); + b[ 75] = *(a06 + 5); + b[ 76] = *(a07 + 4); + b[ 77] = *(a07 + 5); + b[ 78] = *(a08 + 4); + b[ 79] = *(a08 + 5); + b[ 80] = *(a09 + 4); + b[ 81] = *(a09 + 5); + b[ 82] = *(a10 + 4); + b[ 83] = *(a10 + 5); + b[ 84] = *(a11 + 4); + b[ 85] = *(a11 + 5); + b[ 86] = *(a12 + 4); + b[ 87] = *(a12 + 5); + b[ 88] = *(a13 + 4); + b[ 89] = *(a13 + 5); + b[ 90] = *(a14 + 4); + b[ 91] = *(a14 + 5); + b[ 92] = *(a15 + 4); + b[ 93] = *(a15 + 5); + b[ 94] = *(a16 + 4); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a05 + 6); + b[105] = *(a05 + 7); + b[106] = *(a06 + 6); + b[107] = *(a06 + 7); + b[108] = *(a07 + 6); + b[109] = *(a07 + 7); + b[110] = *(a08 + 6); + b[111] = *(a08 + 7); + b[112] = *(a09 + 6); + b[113] = *(a09 + 7); + b[114] = *(a10 + 6); + b[115] = *(a10 + 7); + b[116] = *(a11 + 6); + b[117] = *(a11 + 7); + b[118] = *(a12 + 6); + b[119] = *(a12 + 7); + b[120] = *(a13 + 6); + b[121] = *(a13 + 7); + b[122] = *(a14 + 6); + b[123] = *(a14 + 7); + b[124] = *(a15 + 6); + b[125] = *(a15 + 7); + b[126] = *(a16 + 6); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a06 + 8); + b[139] = *(a06 + 9); + b[140] = *(a07 + 8); + b[141] = *(a07 + 9); + b[142] = *(a08 + 8); + b[143] = *(a08 + 9); + b[144] = *(a09 + 8); + b[145] = *(a09 + 9); + b[146] = *(a10 + 8); + b[147] = *(a10 + 9); + b[148] = *(a11 + 8); + b[149] = *(a11 + 9); + b[150] = *(a12 + 8); + b[151] = *(a12 + 9); + b[152] = *(a13 + 8); + b[153] = *(a13 + 9); + b[154] = *(a14 + 8); + b[155] = *(a14 + 9); + b[156] = *(a15 + 8); + b[157] = *(a15 + 9); + b[158] = *(a16 + 8); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a07 + 10); + b[173] = *(a07 + 11); + b[174] = *(a08 + 10); + b[175] = *(a08 + 11); + b[176] = *(a09 + 10); + b[177] = *(a09 + 11); + b[178] = *(a10 + 10); + b[179] = *(a10 + 11); + b[180] = *(a11 + 10); + b[181] = *(a11 + 11); + b[182] = *(a12 + 10); + b[183] = *(a12 + 11); + b[184] = *(a13 + 10); + b[185] = *(a13 + 11); + b[186] = *(a14 + 10); + b[187] = *(a14 + 11); + b[188] = *(a15 + 10); + b[189] = *(a15 + 11); + b[190] = *(a16 + 10); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a08 + 12); + b[207] = *(a08 + 13); + b[208] = *(a09 + 12); + b[209] = *(a09 + 13); + b[210] = *(a10 + 12); + b[211] = *(a10 + 13); + b[212] = *(a11 + 12); + b[213] = *(a11 + 13); + b[214] = *(a12 + 12); + b[215] = *(a12 + 13); + b[216] = *(a13 + 12); + b[217] = *(a13 + 13); + b[218] = *(a14 + 12); + b[219] = *(a14 + 13); + b[220] = *(a15 + 12); + b[221] = *(a15 + 13); + b[222] = *(a16 + 12); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a09 + 14); + b[241] = *(a09 + 15); + b[242] = *(a10 + 14); + b[243] = *(a10 + 15); + b[244] = *(a11 + 14); + b[245] = *(a11 + 15); + b[246] = *(a12 + 14); + b[247] = *(a12 + 15); + b[248] = *(a13 + 14); + b[249] = *(a13 + 15); + b[250] = *(a14 + 14); + b[251] = *(a14 + 15); + b[252] = *(a15 + 14); + b[253] = *(a15 + 15); + b[254] = *(a16 + 14); + b[255] = *(a16 + 15); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a10 + 16); + b[275] = *(a10 + 17); + b[276] = *(a11 + 16); + b[277] = *(a11 + 17); + b[278] = *(a12 + 16); + b[279] = *(a12 + 17); + b[280] = *(a13 + 16); + b[281] = *(a13 + 17); + b[282] = *(a14 + 16); + b[283] = *(a14 + 17); + b[284] = *(a15 + 16); + b[285] = *(a15 + 17); + b[286] = *(a16 + 16); + b[287] = *(a16 + 17); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a11 + 18); + b[309] = *(a11 + 19); + b[310] = *(a12 + 18); + b[311] = *(a12 + 19); + b[312] = *(a13 + 18); + b[313] = *(a13 + 19); + b[314] = *(a14 + 18); + b[315] = *(a14 + 19); + b[316] = *(a15 + 18); + b[317] = *(a15 + 19); + b[318] = *(a16 + 18); + b[319] = *(a16 + 19); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a12 + 20); + b[343] = *(a12 + 21); + b[344] = *(a13 + 20); + b[345] = *(a13 + 21); + b[346] = *(a14 + 20); + b[347] = *(a14 + 21); + b[348] = *(a15 + 20); + b[349] = *(a15 + 21); + b[350] = *(a16 + 20); + b[351] = *(a16 + 21); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a13 + 22); + b[377] = *(a13 + 23); + b[378] = *(a14 + 22); + b[379] = *(a14 + 23); + b[380] = *(a15 + 22); + b[381] = *(a15 + 23); + b[382] = *(a16 + 22); + b[383] = *(a16 + 23); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a14 + 24); + b[411] = *(a14 + 25); + b[412] = *(a15 + 24); + b[413] = *(a15 + 25); + b[414] = *(a16 + 24); + b[415] = *(a16 + 25); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a15 + 26); + b[445] = *(a15 + 27); + b[446] = *(a16 + 26); + b[447] = *(a16 + 27); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a16 + 28); + b[479] = *(a16 + 29); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[ 10] = *(a06 + 2); + b[ 11] = *(a06 + 3); + b[ 12] = *(a07 + 2); + b[ 13] = *(a07 + 3); + b[ 14] = *(a08 + 2); + b[ 15] = *(a08 + 3); + b[ 16] = *(a09 + 2); + b[ 17] = *(a09 + 3); + b[ 18] = *(a10 + 2); + b[ 19] = *(a10 + 3); + b[ 20] = *(a11 + 2); + b[ 21] = *(a11 + 3); + b[ 22] = *(a12 + 2); + b[ 23] = *(a12 + 3); + b[ 24] = *(a13 + 2); + b[ 25] = *(a13 + 3); + b[ 26] = *(a14 + 2); + b[ 27] = *(a14 + 3); + b[ 28] = *(a15 + 2); + b[ 29] = *(a15 + 3); + b[ 30] = *(a16 + 2); + b[ 31] = *(a16 + 3); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[ 10] = *(a06 + 4); + b[ 11] = *(a06 + 5); + b[ 12] = *(a07 + 4); + b[ 13] = *(a07 + 5); + b[ 14] = *(a08 + 4); + b[ 15] = *(a08 + 5); + b[ 16] = *(a09 + 4); + b[ 17] = *(a09 + 5); + b[ 18] = *(a10 + 4); + b[ 19] = *(a10 + 5); + b[ 20] = *(a11 + 4); + b[ 21] = *(a11 + 5); + b[ 22] = *(a12 + 4); + b[ 23] = *(a12 + 5); + b[ 24] = *(a13 + 4); + b[ 25] = *(a13 + 5); + b[ 26] = *(a14 + 4); + b[ 27] = *(a14 + 5); + b[ 28] = *(a15 + 4); + b[ 29] = *(a15 + 5); + b[ 30] = *(a16 + 4); + b[ 31] = *(a16 + 5); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[ 10] = *(a06 + 6); + b[ 11] = *(a06 + 7); + b[ 12] = *(a07 + 6); + b[ 13] = *(a07 + 7); + b[ 14] = *(a08 + 6); + b[ 15] = *(a08 + 7); + b[ 16] = *(a09 + 6); + b[ 17] = *(a09 + 7); + b[ 18] = *(a10 + 6); + b[ 19] = *(a10 + 7); + b[ 20] = *(a11 + 6); + b[ 21] = *(a11 + 7); + b[ 22] = *(a12 + 6); + b[ 23] = *(a12 + 7); + b[ 24] = *(a13 + 6); + b[ 25] = *(a13 + 7); + b[ 26] = *(a14 + 6); + b[ 27] = *(a14 + 7); + b[ 28] = *(a15 + 6); + b[ 29] = *(a15 + 7); + b[ 30] = *(a16 + 6); + b[ 31] = *(a16 + 7); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a06 + 8); + b[ 11] = *(a06 + 9); + b[ 12] = *(a07 + 8); + b[ 13] = *(a07 + 9); + b[ 14] = *(a08 + 8); + b[ 15] = *(a08 + 9); + b[ 16] = *(a09 + 8); + b[ 17] = *(a09 + 9); + b[ 18] = *(a10 + 8); + b[ 19] = *(a10 + 9); + b[ 20] = *(a11 + 8); + b[ 21] = *(a11 + 9); + b[ 22] = *(a12 + 8); + b[ 23] = *(a12 + 9); + b[ 24] = *(a13 + 8); + b[ 25] = *(a13 + 9); + b[ 26] = *(a14 + 8); + b[ 27] = *(a14 + 9); + b[ 28] = *(a15 + 8); + b[ 29] = *(a15 + 9); + b[ 30] = *(a16 + 8); + b[ 31] = *(a16 + 9); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a07 + 10); + b[ 13] = *(a07 + 11); + b[ 14] = *(a08 + 10); + b[ 15] = *(a08 + 11); + b[ 16] = *(a09 + 10); + b[ 17] = *(a09 + 11); + b[ 18] = *(a10 + 10); + b[ 19] = *(a10 + 11); + b[ 20] = *(a11 + 10); + b[ 21] = *(a11 + 11); + b[ 22] = *(a12 + 10); + b[ 23] = *(a12 + 11); + b[ 24] = *(a13 + 10); + b[ 25] = *(a13 + 11); + b[ 26] = *(a14 + 10); + b[ 27] = *(a14 + 11); + b[ 28] = *(a15 + 10); + b[ 29] = *(a15 + 11); + b[ 30] = *(a16 + 10); + b[ 31] = *(a16 + 11); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a08 + 12); + b[ 15] = *(a08 + 13); + b[ 16] = *(a09 + 12); + b[ 17] = *(a09 + 13); + b[ 18] = *(a10 + 12); + b[ 19] = *(a10 + 13); + b[ 20] = *(a11 + 12); + b[ 21] = *(a11 + 13); + b[ 22] = *(a12 + 12); + b[ 23] = *(a12 + 13); + b[ 24] = *(a13 + 12); + b[ 25] = *(a13 + 13); + b[ 26] = *(a14 + 12); + b[ 27] = *(a14 + 13); + b[ 28] = *(a15 + 12); + b[ 29] = *(a15 + 13); + b[ 30] = *(a16 + 12); + b[ 31] = *(a16 + 13); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a09 + 14); + b[ 17] = *(a09 + 15); + b[ 18] = *(a10 + 14); + b[ 19] = *(a10 + 15); + b[ 20] = *(a11 + 14); + b[ 21] = *(a11 + 15); + b[ 22] = *(a12 + 14); + b[ 23] = *(a12 + 15); + b[ 24] = *(a13 + 14); + b[ 25] = *(a13 + 15); + b[ 26] = *(a14 + 14); + b[ 27] = *(a14 + 15); + b[ 28] = *(a15 + 14); + b[ 29] = *(a15 + 15); + b[ 30] = *(a16 + 14); + b[ 31] = *(a16 + 15); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a10 + 16); + b[ 19] = *(a10 + 17); + b[ 20] = *(a11 + 16); + b[ 21] = *(a11 + 17); + b[ 22] = *(a12 + 16); + b[ 23] = *(a12 + 17); + b[ 24] = *(a13 + 16); + b[ 25] = *(a13 + 17); + b[ 26] = *(a14 + 16); + b[ 27] = *(a14 + 17); + b[ 28] = *(a15 + 16); + b[ 29] = *(a15 + 17); + b[ 30] = *(a16 + 16); + b[ 31] = *(a16 + 17); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a11 + 18); + b[ 21] = *(a11 + 19); + b[ 22] = *(a12 + 18); + b[ 23] = *(a12 + 19); + b[ 24] = *(a13 + 18); + b[ 25] = *(a13 + 19); + b[ 26] = *(a14 + 18); + b[ 27] = *(a14 + 19); + b[ 28] = *(a15 + 18); + b[ 29] = *(a15 + 19); + b[ 30] = *(a16 + 18); + b[ 31] = *(a16 + 19); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a12 + 20); + b[ 23] = *(a12 + 21); + b[ 24] = *(a13 + 20); + b[ 25] = *(a13 + 21); + b[ 26] = *(a14 + 20); + b[ 27] = *(a14 + 21); + b[ 28] = *(a15 + 20); + b[ 29] = *(a15 + 21); + b[ 30] = *(a16 + 20); + b[ 31] = *(a16 + 21); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a13 + 22); + b[ 25] = *(a13 + 23); + b[ 26] = *(a14 + 22); + b[ 27] = *(a14 + 23); + b[ 28] = *(a15 + 22); + b[ 29] = *(a15 + 23); + b[ 30] = *(a16 + 22); + b[ 31] = *(a16 + 23); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a14 + 24); + b[ 27] = *(a14 + 25); + b[ 28] = *(a15 + 24); + b[ 29] = *(a15 + 25); + b[ 30] = *(a16 + 24); + b[ 31] = *(a16 + 25); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a15 + 26); + b[ 29] = *(a15 + 27); + b[ 30] = *(a16 + 26); + b[ 31] = *(a16 + 27); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a16 + 28); + b[ 31] = *(a16 + 29); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a03 + 2); + b[ 21] = *(a03 + 3); + b[ 22] = *(a04 + 2); + b[ 23] = *(a04 + 3); + b[ 24] = *(a05 + 2); + b[ 25] = *(a05 + 3); + b[ 26] = *(a06 + 2); + b[ 27] = *(a06 + 3); + b[ 28] = *(a07 + 2); + b[ 29] = *(a07 + 3); + b[ 30] = *(a08 + 2); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a04 + 4); + b[ 39] = *(a04 + 5); + b[ 40] = *(a05 + 4); + b[ 41] = *(a05 + 5); + b[ 42] = *(a06 + 4); + b[ 43] = *(a06 + 5); + b[ 44] = *(a07 + 4); + b[ 45] = *(a07 + 5); + b[ 46] = *(a08 + 4); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a05 + 6); + b[ 57] = *(a05 + 7); + b[ 58] = *(a06 + 6); + b[ 59] = *(a06 + 7); + b[ 60] = *(a07 + 6); + b[ 61] = *(a07 + 7); + b[ 62] = *(a08 + 6); + b[ 63] = *(a08 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a06 + 8); + b[ 75] = *(a06 + 9); + b[ 76] = *(a07 + 8); + b[ 77] = *(a07 + 9); + b[ 78] = *(a08 + 8); + b[ 79] = *(a08 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a07 + 10); + b[ 93] = *(a07 + 11); + b[ 94] = *(a08 + 10); + b[ 95] = *(a08 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a08 + 12); + b[111] = *(a08 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[10] = *(a06 + 0); + b[11] = *(a06 + 1); + b[12] = *(a07 + 0); + b[13] = *(a07 + 1); + b[14] = *(a08 + 0); + b[15] = *(a08 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[10] = *(a06 + 2); + b[11] = *(a06 + 3); + b[12] = *(a07 + 2); + b[13] = *(a07 + 3); + b[14] = *(a08 + 2); + b[15] = *(a08 + 3); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[10] = *(a06 + 4); + b[11] = *(a06 + 5); + b[12] = *(a07 + 4); + b[13] = *(a07 + 5); + b[14] = *(a08 + 4); + b[15] = *(a08 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[10] = *(a06 + 6); + b[11] = *(a06 + 7); + b[12] = *(a07 + 6); + b[13] = *(a07 + 7); + b[14] = *(a08 + 6); + b[15] = *(a08 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a06 + 8); + b[11] = *(a06 + 9); + b[12] = *(a07 + 8); + b[13] = *(a07 + 9); + b[14] = *(a08 + 8); + b[15] = *(a08 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a07 + 10); + b[13] = *(a07 + 11); + b[14] = *(a08 + 10); + b[15] = *(a08 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a08 + 12); + b[15] = *(a08 + 13); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a03 + 2); + b[ 13] = *(a03 + 3); + b[ 14] = *(a04 + 2); + b[ 15] = *(a04 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a04 + 4); + b[ 23] = *(a04 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X > posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X > posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c new file mode 100644 index 000000000..5aba3727a --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_16.c @@ -0,0 +1,2318 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + + b += 512; + } else + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a02 + 0); + b[ 33] = *(a02 + 1); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a03 + 0); + b[ 65] = *(a03 + 1); + b[ 66] = *(a03 + 2); + b[ 67] = *(a03 + 3); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a04 + 0); + b[ 97] = *(a04 + 1); + b[ 98] = *(a04 + 2); + b[ 99] = *(a04 + 3); + b[100] = *(a04 + 4); + b[101] = *(a04 + 5); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a05 + 0); + b[129] = *(a05 + 1); + b[130] = *(a05 + 2); + b[131] = *(a05 + 3); + b[132] = *(a05 + 4); + b[133] = *(a05 + 5); + b[134] = *(a05 + 6); + b[135] = *(a05 + 7); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a06 + 0); + b[161] = *(a06 + 1); + b[162] = *(a06 + 2); + b[163] = *(a06 + 3); + b[164] = *(a06 + 4); + b[165] = *(a06 + 5); + b[166] = *(a06 + 6); + b[167] = *(a06 + 7); + b[168] = *(a06 + 8); + b[169] = *(a06 + 9); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a07 + 0); + b[193] = *(a07 + 1); + b[194] = *(a07 + 2); + b[195] = *(a07 + 3); + b[196] = *(a07 + 4); + b[197] = *(a07 + 5); + b[198] = *(a07 + 6); + b[199] = *(a07 + 7); + b[200] = *(a07 + 8); + b[201] = *(a07 + 9); + b[202] = *(a07 + 10); + b[203] = *(a07 + 11); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a08 + 0); + b[225] = *(a08 + 1); + b[226] = *(a08 + 2); + b[227] = *(a08 + 3); + b[228] = *(a08 + 4); + b[229] = *(a08 + 5); + b[230] = *(a08 + 6); + b[231] = *(a08 + 7); + b[232] = *(a08 + 8); + b[233] = *(a08 + 9); + b[234] = *(a08 + 10); + b[235] = *(a08 + 11); + b[236] = *(a08 + 12); + b[237] = *(a08 + 13); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a09 + 0); + b[257] = *(a09 + 1); + b[258] = *(a09 + 2); + b[259] = *(a09 + 3); + b[260] = *(a09 + 4); + b[261] = *(a09 + 5); + b[262] = *(a09 + 6); + b[263] = *(a09 + 7); + b[264] = *(a09 + 8); + b[265] = *(a09 + 9); + b[266] = *(a09 + 10); + b[267] = *(a09 + 11); + b[268] = *(a09 + 12); + b[269] = *(a09 + 13); + b[270] = *(a09 + 14); + b[271] = *(a09 + 15); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a10 + 0); + b[289] = *(a10 + 1); + b[290] = *(a10 + 2); + b[291] = *(a10 + 3); + b[292] = *(a10 + 4); + b[293] = *(a10 + 5); + b[294] = *(a10 + 6); + b[295] = *(a10 + 7); + b[296] = *(a10 + 8); + b[297] = *(a10 + 9); + b[298] = *(a10 + 10); + b[299] = *(a10 + 11); + b[300] = *(a10 + 12); + b[301] = *(a10 + 13); + b[302] = *(a10 + 14); + b[303] = *(a10 + 15); + b[304] = *(a10 + 16); + b[305] = *(a10 + 17); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a11 + 0); + b[321] = *(a11 + 1); + b[322] = *(a11 + 2); + b[323] = *(a11 + 3); + b[324] = *(a11 + 4); + b[325] = *(a11 + 5); + b[326] = *(a11 + 6); + b[327] = *(a11 + 7); + b[328] = *(a11 + 8); + b[329] = *(a11 + 9); + b[330] = *(a11 + 10); + b[331] = *(a11 + 11); + b[332] = *(a11 + 12); + b[333] = *(a11 + 13); + b[334] = *(a11 + 14); + b[335] = *(a11 + 15); + b[336] = *(a11 + 16); + b[337] = *(a11 + 17); + b[338] = *(a11 + 18); + b[339] = *(a11 + 19); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a12 + 0); + b[353] = *(a12 + 1); + b[354] = *(a12 + 2); + b[355] = *(a12 + 3); + b[356] = *(a12 + 4); + b[357] = *(a12 + 5); + b[358] = *(a12 + 6); + b[359] = *(a12 + 7); + b[360] = *(a12 + 8); + b[361] = *(a12 + 9); + b[362] = *(a12 + 10); + b[363] = *(a12 + 11); + b[364] = *(a12 + 12); + b[365] = *(a12 + 13); + b[366] = *(a12 + 14); + b[367] = *(a12 + 15); + b[368] = *(a12 + 16); + b[369] = *(a12 + 17); + b[370] = *(a12 + 18); + b[371] = *(a12 + 19); + b[372] = *(a12 + 20); + b[373] = *(a12 + 21); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a13 + 0); + b[385] = *(a13 + 1); + b[386] = *(a13 + 2); + b[387] = *(a13 + 3); + b[388] = *(a13 + 4); + b[389] = *(a13 + 5); + b[390] = *(a13 + 6); + b[391] = *(a13 + 7); + b[392] = *(a13 + 8); + b[393] = *(a13 + 9); + b[394] = *(a13 + 10); + b[395] = *(a13 + 11); + b[396] = *(a13 + 12); + b[397] = *(a13 + 13); + b[398] = *(a13 + 14); + b[399] = *(a13 + 15); + b[400] = *(a13 + 16); + b[401] = *(a13 + 17); + b[402] = *(a13 + 18); + b[403] = *(a13 + 19); + b[404] = *(a13 + 20); + b[405] = *(a13 + 21); + b[406] = *(a13 + 22); + b[407] = *(a13 + 23); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a14 + 0); + b[417] = *(a14 + 1); + b[418] = *(a14 + 2); + b[419] = *(a14 + 3); + b[420] = *(a14 + 4); + b[421] = *(a14 + 5); + b[422] = *(a14 + 6); + b[423] = *(a14 + 7); + b[424] = *(a14 + 8); + b[425] = *(a14 + 9); + b[426] = *(a14 + 10); + b[427] = *(a14 + 11); + b[428] = *(a14 + 12); + b[429] = *(a14 + 13); + b[430] = *(a14 + 14); + b[431] = *(a14 + 15); + b[432] = *(a14 + 16); + b[433] = *(a14 + 17); + b[434] = *(a14 + 18); + b[435] = *(a14 + 19); + b[436] = *(a14 + 20); + b[437] = *(a14 + 21); + b[438] = *(a14 + 22); + b[439] = *(a14 + 23); + b[440] = *(a14 + 24); + b[441] = *(a14 + 25); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a15 + 0); + b[449] = *(a15 + 1); + b[450] = *(a15 + 2); + b[451] = *(a15 + 3); + b[452] = *(a15 + 4); + b[453] = *(a15 + 5); + b[454] = *(a15 + 6); + b[455] = *(a15 + 7); + b[456] = *(a15 + 8); + b[457] = *(a15 + 9); + b[458] = *(a15 + 10); + b[459] = *(a15 + 11); + b[460] = *(a15 + 12); + b[461] = *(a15 + 13); + b[462] = *(a15 + 14); + b[463] = *(a15 + 15); + b[464] = *(a15 + 16); + b[465] = *(a15 + 17); + b[466] = *(a15 + 18); + b[467] = *(a15 + 19); + b[468] = *(a15 + 20); + b[469] = *(a15 + 21); + b[470] = *(a15 + 22); + b[471] = *(a15 + 23); + b[472] = *(a15 + 24); + b[473] = *(a15 + 25); + b[474] = *(a15 + 26); + b[475] = *(a15 + 27); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a16 + 0); + b[481] = *(a16 + 1); + b[482] = *(a16 + 2); + b[483] = *(a16 + 3); + b[484] = *(a16 + 4); + b[485] = *(a16 + 5); + b[486] = *(a16 + 6); + b[487] = *(a16 + 7); + b[488] = *(a16 + 8); + b[489] = *(a16 + 9); + b[490] = *(a16 + 10); + b[491] = *(a16 + 11); + b[492] = *(a16 + 12); + b[493] = *(a16 + 13); + b[494] = *(a16 + 14); + b[495] = *(a16 + 15); + b[496] = *(a16 + 16); + b[497] = *(a16 + 17); + b[498] = *(a16 + 18); + b[499] = *(a16 + 19); + b[500] = *(a16 + 20); + b[501] = *(a16 + 21); + b[502] = *(a16 + 22); + b[503] = *(a16 + 23); + b[504] = *(a16 + 24); + b[505] = *(a16 + 25); + b[506] = *(a16 + 26); + b[507] = *(a16 + 27); + b[508] = *(a16 + 28); + b[509] = *(a16 + 29); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + // a01 += 2 * i; + // a02 += 2 * i; + // a03 += 2 * i; + // a04 += 2 * i; + // a05 += 2 * i; + // a06 += 2 * i; + // a07 += 2 * i; + // a08 += 2 * i; + // a09 += 2 * i; + // a10 += 2 * i; + // a11 += 2 * i; + // a12 += 2 * i; + // a13 += 2 * i; + // a14 += 2 * i; + // a15 += 2 * i; + // a16 += 2 * i; + b += 32 * i; + + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); + b[ 7] = *(a08 + 7); + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[ 10] = *(a08 + 10); + b[ 11] = *(a08 + 11); + b[ 12] = *(a08 + 12); + b[ 13] = *(a08 + 13); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); + b[ 8] = *(a09 + 8); + b[ 9] = *(a09 + 9); + b[ 10] = *(a09 + 10); + b[ 11] = *(a09 + 11); + b[ 12] = *(a09 + 12); + b[ 13] = *(a09 + 13); + b[ 14] = *(a09 + 14); + b[ 15] = *(a09 + 15); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); + b[ 9] = *(a10 + 9); + b[ 10] = *(a10 + 10); + b[ 11] = *(a10 + 11); + b[ 12] = *(a10 + 12); + b[ 13] = *(a10 + 13); + b[ 14] = *(a10 + 14); + b[ 15] = *(a10 + 15); + b[ 16] = *(a10 + 16); + b[ 17] = *(a10 + 17); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); + b[ 10] = *(a11 + 10); + b[ 11] = *(a11 + 11); + b[ 12] = *(a11 + 12); + b[ 13] = *(a11 + 13); + b[ 14] = *(a11 + 14); + b[ 15] = *(a11 + 15); + b[ 16] = *(a11 + 16); + b[ 17] = *(a11 + 17); + b[ 18] = *(a11 + 18); + b[ 19] = *(a11 + 19); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[ 10] = *(a12 + 10); + b[ 11] = *(a12 + 11); + b[ 12] = *(a12 + 12); + b[ 13] = *(a12 + 13); + b[ 14] = *(a12 + 14); + b[ 15] = *(a12 + 15); + b[ 16] = *(a12 + 16); + b[ 17] = *(a12 + 17); + b[ 18] = *(a12 + 18); + b[ 19] = *(a12 + 19); + b[ 20] = *(a12 + 20); + b[ 21] = *(a12 + 21); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 + 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[ 10] = *(a13 + 10); + b[ 11] = *(a13 + 11); + b[ 12] = *(a13 + 12); + b[ 13] = *(a13 + 13); + b[ 14] = *(a13 + 14); + b[ 15] = *(a13 + 15); + b[ 16] = *(a13 + 16); + b[ 17] = *(a13 + 17); + b[ 18] = *(a13 + 18); + b[ 19] = *(a13 + 19); + b[ 20] = *(a13 + 20); + b[ 21] = *(a13 + 21); + b[ 22] = *(a13 + 22); + b[ 23] = *(a13 + 23); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[ 10] = *(a14 + 10); + b[ 11] = *(a14 + 11); + b[ 12] = *(a14 + 12); + b[ 13] = *(a14 + 13); + b[ 14] = *(a14 + 14); + b[ 15] = *(a14 + 15); + b[ 16] = *(a14 + 16); + b[ 17] = *(a14 + 17); + b[ 18] = *(a14 + 18); + b[ 19] = *(a14 + 19); + b[ 20] = *(a14 + 20); + b[ 21] = *(a14 + 21); + b[ 22] = *(a14 + 22); + b[ 23] = *(a14 + 23); + b[ 24] = *(a14 + 24); + b[ 25] = *(a14 + 25); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[ 10] = *(a15 + 10); + b[ 11] = *(a15 + 11); + b[ 12] = *(a15 + 12); + b[ 13] = *(a15 + 13); + b[ 14] = *(a15 + 14); + b[ 15] = *(a15 + 15); + b[ 16] = *(a15 + 16); + b[ 17] = *(a15 + 17); + b[ 18] = *(a15 + 18); + b[ 19] = *(a15 + 19); + b[ 20] = *(a15 + 20); + b[ 21] = *(a15 + 21); + b[ 22] = *(a15 + 22); + b[ 23] = *(a15 + 23); + b[ 24] = *(a15 + 24); + b[ 25] = *(a15 + 25); + b[ 26] = *(a15 + 26); + b[ 27] = *(a15 + 27); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } +#if 1 + } +#else + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } +#endif + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + } else { +#ifdef UNIT + if (X > posY) { +#endif + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#ifdef UNIT + } else { + b[ 0] = ONE; + b[ 1] = ZERO; + } +#endif + a01 += lda; + } + b += 2; + X ++; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c new file mode 100644 index 000000000..4fd72c13e --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_16.c @@ -0,0 +1,308 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c new file mode 100644 index 000000000..e9aeae1ad --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_16.c @@ -0,0 +1,264 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c new file mode 100644 index 000000000..e84d96891 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_16.c @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c new file mode 100644 index 000000000..efcea5c3f --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_16.c @@ -0,0 +1,261 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 17d15656a..ce9268b93 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -111,9 +111,13 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_16x4_lasx.S +CGEMMINCOPY = cgemm_ncopy_16_lasx.S +CGEMMITCOPY = cgemm_tcopy_16_lasx.S +CGEMMONCOPY = cgemm_ncopy_4_lasx.S +CGEMMOTCOPY = cgemm_tcopy_4_lasx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S new file mode 100644 index 000000000..249abe102 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S @@ -0,0 +1,3757 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 + xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + + xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 + xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 + + xvand.v U4, D0, D0 + + xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12 + xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16 + + xvst U4, TD, 0x00 + xvst D1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vand.v $vr8, $vr1, $vr1 + + vpermi.w $vr8, $vr0, 0x44 + vpermi.w $vr1, $vr0, 0xee + + vst $vr8, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..7d9eb94c8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S @@ -0,0 +1,741 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fld.s F8, S5, 0x00 + fld.s F9, S5, 0x04 + + fld.s F10, S6, 0x00 + fld.s F11, S6, 0x04 + + fld.s F12, S7, 0x00 + fld.s F13, S7, 0x04 + + fld.s F14, S8, 0x00 + fld.s F15, S8, 0x04 + + fst.s F0, P5, 0x00 + fst.s F1, P5, 0x04 + fst.s F2, P5, 0x08 + fst.s F3, P5, 0x0c + fst.s F4, P5, 0x10 + fst.s F5, P5, 0x14 + fst.s F6, P5, 0x18 + fst.s F7, P5, 0x1c + fst.s F8, P5, 0x20 + fst.s F9, P5, 0x24 + fst.s F10, P5, 0x28 + fst.s F11, P5, 0x2c + fst.s F12, P5, 0x30 + fst.s F13, P5, 0x34 + fst.s F14, P5, 0x38 + fst.s F15, P5, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..9ff8a35b8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S @@ -0,0 +1,306 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + vst $vr2, S9, 0x20 + vst $vr3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S9, 0x00 + vst $vr1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, S8, 0x00 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld $vr0, S1, 0x00 + + vst $vr0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/param.h b/param.h index 5d2e960a2..8bdc03380 100644 --- a/param.h +++ b/param.h @@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 #else #define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_M 16 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 16 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 #endif #define QGEMM_DEFAULT_UNROLL_N 2 -#define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 From bf2310442b7eda57ad8089878518b3f43733efaf Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 21 Feb 2024 13:26:28 -0600 Subject: [PATCH 679/718] Fix get_num_cores for AIX. --- getarch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/getarch.c b/getarch.c index f879e6bbb..2b5459a5f 100644 --- a/getarch.c +++ b/getarch.c @@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #endif -#if defined(AIX) +#if defined(_AIX) +#include +#include #include #endif @@ -1870,11 +1872,13 @@ static int get_num_cores(void) { return count; -#elif defined(AIX) +#elif defined(_AIX) //returns the number of processors which are currently online count = sysconf(_SC_NPROCESSORS_ONLN); if (count <= 0) count = 2; - + + return count; + #else return 2; #endif From 9b24b3141985de115e6191b376b435117a32404a Mon Sep 17 00:00:00 2001 From: frjohnst Date: Wed, 21 Feb 2024 15:52:29 -0500 Subject: [PATCH 680/718] resolve second_ conflict which breaks xlf timef --- lapack-netlib/SRC/Makefile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index de2242701..205a32d31 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -101,8 +101,10 @@ SCLAUX = la_constants.o \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \ - slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \ - ../INSTALL/second_$(TIMER).o + slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o +ifneq ($(F_COMPILER), IBM) +SCLAUX += ../INSTALL/second_$(TIMER).o +endif endif ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" @@ -124,7 +126,10 @@ DZLAUX = la_constants.o\ dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \ - ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o + ../INSTALL/dlamch.o +ifneq ($(F_COMPILER), IBM) +DZLAUX += ../INSTALL/dsecnd_$(TIMER).o +endif endif #ifeq ($(BUILD_SINGLE),1) From d51ffec3a20299532a1ef5a403df2760f7b5b653 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 22 Feb 2024 10:46:45 +0800 Subject: [PATCH 681/718] LoongArch64: Opt cgemv with LASX --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/cgemv_n_8_lasx.S | 383 ++++++++++++++++++++++++++ kernel/loongarch64/cgemv_t_8_lasx.S | 342 +++++++++++++++++++++++ kernel/loongarch64/loongarch64_asm.S | 240 ++++++++++++++++ 4 files changed, 968 insertions(+) create mode 100644 kernel/loongarch64/cgemv_n_8_lasx.S create mode 100644 kernel/loongarch64/cgemv_t_8_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index ce9268b93..3b2ee6e55 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMVNKERNEL = cgemv_n_8_lasx.S +CGEMVTKERNEL = cgemv_t_8_lasx.S + CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S new file mode 100644 index 000000000..b078e3227 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -0,0 +1,383 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2, \ + X4, X4, VALPHA, TMP0, TMP1, TMP2, \ + X5, X5, VALPHA, TMP0, TMP1, TMP2, \ + X6, X6, VALPHA, TMP0, TMP1, TMP2, \ + X7, X7, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2, \ + X4, X4, VALPHA, TMP0, TMP1, TMP2, \ + X5, X5, VALPHA, TMP0, TMP1, TMP2, \ + X6, X6, VALPHA, TMP0, TMP1, TMP2, \ + X7, X7, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro CSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro CGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro CLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \ + Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \ + Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \ + Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \ + Y0, X7, A14, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + CLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\Y_8 + CGEMV_N_8x8 + CSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x8 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S new file mode 100644 index 000000000..94e4bd2eb --- /dev/null +++ b/kernel/loongarch64/cgemv_t_8_lasx.S @@ -0,0 +1,342 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2022/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro CLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro CGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \ + TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \ + TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \ + TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \ + TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LASX XW:req, X8:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + CLOAD_\X8 + CGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + // Accumulated + GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 7 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \ + A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \ + A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\ + A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\ + A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 7 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + xvpackev.w $xr0, $xr1, $xr0 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LASX GAP_0, X8 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LASX GAP_1, X8_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index 694dcdaa9..fee46d63e 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endif .endm +// +// GCOMPLEXACC: Complex accumulate the values of vector registers +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// Note: When "pre_op = xvf && suf_op = s", in will be modified. +// +.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 +.ifeqs "\suf_op", "s" + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.else + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" +.ifeqs "\suf_op", "s" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifnb \more + GCOMPLEXACC \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMUL: Complex multiplication, out = in0 * in1 +// xconj: default value 0. +// if !(xconj) +// out_r = in0_r * in1_r - in0_i * in1_i; +// out_i = in0_r * in1_i + in0_i * in1_r; +// else +// out_r = in0_r * in1_r + in0_i * in1_i; +// out_i = in0_r * in1_i - in0_i * in1_r; +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()mul.\suf_op \out, \tmp0, \in1 + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more +.endif +.endm + +// +// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2 +// xconj: default value 0 +// conj: default value 0 +// if !(CONJ) +// if !(XCONJ) +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i + in0_i * in1_r + in2_i; +// else +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in0_r * in1_i - in0_i * in1_r + in2_i; +// else +// if !(XCONJ) +// out_r = in0_r * in1_r + in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r); +// else +// out_r = in0_r * in1_r - in0_i * in1_i + in2_r; +// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r); +// pre_op: xvf or vf, differentiate between LSX or LASX instruction +// suf_op: s or d, differentiate between single precision or double precision complex numbers +// +.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + xvpackev.w \tmp0, \in0, \in0 +.else + xvpackev.d \tmp0, \in0, \in0 +.endif +.else + vxor.v \tmp1, \tmp1, \tmp1 +.ifeqs "\suf_op", "s" + vpackev.w \tmp0, \in0, \in0 +.else + vpackev.d \tmp0, \in0, \in0 +.endif +.endif + + \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 +.ifeqs "\conj", "1" + \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" + xvshuf4i.w \tmp0, \tmp0, 0xb1 + xvpackev.w \out, \tmp0, \tmp2 +.else + xvshuf4i.d \tmp0, \tmp0, 0x0b + xvpackev.d \out, \tmp0, \tmp2 +.endif +.else +.ifeqs "\suf_op", "s" + vshuf4i.w \tmp0, \tmp0, 0xb1 + vpackev.w \out, \tmp0, \tmp2 +.else + vshuf4i.d \tmp0, \tmp0, 0x0b + vpackev.d \out, \tmp0, \tmp2 +.endif +.endif /* pre_op = xvf */ +.else + \pre_op\()add.\suf_op \out, \tmp2, \tmp1 +.endif /* conj = 1 */ + + \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 + +.ifeqs "\pre_op", "xvf" +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \tmp1 +.else + xvpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.w \tmp1, \in0, \in0 +.else + xvpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \tmp1 +.else + xvpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + xvpackod.d \tmp1, \in0, \in0 +.else + xvpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + xvshuf4i.d \tmp2, \in1, 0x0b +.endif +.else +.ifeqs "\suf_op", "s" +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \tmp1 +.else + vpackod.w \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.w \tmp1, \in0, \in0 +.else + vpackod.w \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.w \tmp2, \in1, 0xb1 +.else +.ifeqs "\conj", "0" +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \tmp1 +.else + vpackod.d \tmp1, \tmp1, \in0 +.endif +.else +.ifeqs "\xconj", "0" + vpackod.d \tmp1, \in0, \in0 +.else + vpackod.d \tmp1, \tmp1, \tmp1 +.endif +.endif + vshuf4i.d \tmp2, \in1, 0x0b +.endif +.endif + + \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out + +.ifnb \more + GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more +.endif +.endm + // // Media Related Macros // From 990507e3b8f833de26da1b2ab6dff0972ae65c3f Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 22 Feb 2024 11:41:15 +0800 Subject: [PATCH 682/718] LoongArch64: Opt zgemv with LASX --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 3 + kernel/loongarch64/zgemv_n_4_lasx.S | 343 ++++++++++++++++++++++++++ kernel/loongarch64/zgemv_t_4_lasx.S | 299 ++++++++++++++++++++++ 3 files changed, 645 insertions(+) create mode 100644 kernel/loongarch64/zgemv_n_4_lasx.S create mode 100644 kernel/loongarch64/zgemv_t_4_lasx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 3b2ee6e55..9b55d1bbb 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -139,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMVNKERNEL = zgemv_n_4_lasx.S +ZGEMVTKERNEL = zgemv_t_4_lasx.S + ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S new file mode 100644 index 000000000..98b1a6f7d --- /dev/null +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -0,0 +1,343 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 +#define TMP0 $xr28 +#define TMP1 $xr29 +#define TMP2 $xr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_4 + GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 + GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_4_GAP + xvld X0, X, 0 + xvpermi.q X0, X0, 0 + + PTR_ADD T0, X, INC_X + xvld X1, T0, 0 + xvpermi.q X1, X1, 0 + + PTR_ADD T0, T0, INC_X + xvld X2, T0, 0 + xvpermi.q X2, X2, 0 + + PTR_ADD T0, T0, INC_X + xvld X3, T0, 0 + xvpermi.q X3, X3, 0 + + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ + X1, X1, VALPHA, TMP0, TMP1, TMP2, \ + X2, X2, VALPHA, TMP0, TMP1, TMP2, \ + X3, X3, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_4 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZLOAD_Y_4_GAP + vld $vr10, Y, 0 + vldx $vr13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + vld $vr11, T0, 0 + vldx $vr14, T0, INC_Y + GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02 +.endm + +.macro ZGEMV_N_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_4 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro ZSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + xvstelm.d Y0, Y, 0x08, 1 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 2 + xvstelm.d Y0, T0, 0x08, 3 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + xvstelm.d Y1, T0, 0x08, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + xvstelm.d Y1, T0, 0x08, 3 +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_1x4 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZLOAD_X_1 + GLD xv, , X0, X, 0x00 + GPERMI xv, q, X0, X0, 0 + GCOMPLEXMUL GXCONJ, \ + xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\Y_4 + ZGEMV_N_4x4 + ZSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x4 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE + diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S new file mode 100644 index 000000000..4d33b8f96 --- /dev/null +++ b/kernel/loongarch64/zgemv_t_4_lasx.S @@ -0,0 +1,299 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2024/02/20 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define TMP0 $xr27 +#define TMP1 $xr28 +#define TMP2 $xr29 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X4 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro ZLOAD_X4_GAP + xvld X0, X, 0 + + PTR_ADD T0, X, INC_X + xvld A0, T0, 0 + xvpermi.q X0, A0, 0x02 + + PTR_ADD T0, T0, INC_X + xvld X1, T0, 0 + + PTR_ADD T0, T0, INC_X + xvld A0, T0, 0 + xvpermi.q X1, A0, 0x02 +.endm + +.macro ZGEMV_T_4x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LASX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + ZLOAD_\X4 + ZGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + xvld A8, Y, 0x00 + xvldx A9, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + xvld A10, PY0, 0x00 + xvldx A11, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + vst $vr13, PY0, 0x00 + vstx $vr14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD xv, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + xvld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + xvpackev.d $xr0, $xr1, $xr0 + xvreplve0.q VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LASX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LASX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 892f8ff3e55e24fda9af3f6364319cce3f60116b Mon Sep 17 00:00:00 2001 From: Ayappan Perumal Date: Thu, 22 Feb 2024 07:05:37 -0600 Subject: [PATCH 683/718] Shared library support for AIX --- Makefile | 3 +++ Makefile.system | 4 ---- exports/Makefile | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index b344abcd2..19bab2915 100644 --- a/Makefile +++ b/Makefile @@ -152,6 +152,9 @@ endif ifeq ($(OSNAME), CYGWIN_NT) @$(MAKE) -C exports dll endif +ifeq ($(OSNAME), AIX) + @$(MAKE) -C exports so +endif endif tests : shared diff --git a/Makefile.system b/Makefile.system index 0088eaff5..f7ccc7746 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1707,11 +1707,7 @@ endif LIBDLLNAME = $(LIBPREFIX).dll IMPLIBNAME = lib$(LIBNAMEBASE).dll.a -ifneq ($(OSNAME), AIX) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -else -LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a) -endif LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/exports/Makefile b/exports/Makefile index 7682f851d..cf948ccb2 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -73,6 +73,10 @@ endif endif endif +ifeq ($(F_COMPILER)$(OSNAME), IBMAIX) +EXTRALIB += -lxlf90 +endif + ifeq ($(C_COMPILER), PGI) EXTRALIB += -pgf90libs endif @@ -248,6 +252,20 @@ endif ifeq ($(OSNAME), AIX) +so : ../$(LIBSONAME) linktest.c + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK. + rm -f linktest + +../$(LIBSONAME) : aix.exp + $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB) + +aix.exp : + /usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \ + || ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \ + { if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \ + /usr/bin/sort -u > aix.exp + ifeq ($(COMPILER_F77), xlf) goto32.$(SUFFIX) : ../$(LIBNAME) aix.def From 82b81c0bbee5d1344dcd2fb0f50cba6a7c0ded9f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:11:50 +0100 Subject: [PATCH 684/718] Dont fail if there is no Fortran compiler --- cmake/f_check.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index df3a4858d..4c4f5ac04 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -64,6 +64,7 @@ else () "#define NEEDBUNDERSCORE 1\n") endif() +if (CMAKE_Fortran_COMPILER) get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) string(TOUPPER ${F_COMPILER} F_COMPILER) - +endif() From 8fc2c2db043eda5e415c1673779c8bdcd91870fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:14:13 +0100 Subject: [PATCH 685/718] Fix missing support for INTERFACE64 on ARM64 and MIPS64 --- cmake/fc.cmake | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 5c30be843..bc85a2921 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -6,9 +6,6 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") - if (BINARY64 AND INTERFACE64) - set(FCOMMON_OPT "${FCOMMON_OPT} -i8") - endif () if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () @@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F if (MIPS64) if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () @@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () endif () endif () + if (ARM64 AND INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") From 3516fff378cf5d9153d18c86d9a117b0976e777d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:15:28 +0100 Subject: [PATCH 686/718] Avoid linking both libgomp and libomp in mixed clang/gfortran builds --- ctest/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 91338b73b..6e0a7f309 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,6 +40,10 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat1 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() @@ -65,6 +69,10 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat2 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() @@ -90,6 +98,10 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() From 4adfe4d53185233bfbeeb362734a2d80814d4457 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:16:01 +0100 Subject: [PATCH 687/718] Avoid linking both libgomp and libomp in mixed clang/gfortran builds --- test/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d68b12d87..ace20dffc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,10 +21,14 @@ endif() if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() - +message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) +if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") +target_link_libraries(${test_bin} omp pthread) +endif() endforeach() # $1 exec, $2 input, $3 output_result From ca121eb5eda1c635f1353d261de5783e079c21c1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:17:05 +0100 Subject: [PATCH 688/718] Avoid linking both libgomp and libomp in mixed clang/gfortran builds --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e7236677a..b69417853 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -107,6 +107,10 @@ set(ZDMDEIGTST zchkdmd.f90) macro(add_eig_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") +target_link_libraries(${name} omp pthread) +endif() #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From be20588a3c7c0725cd38846ce3408604fbfb2c95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:17:48 +0100 Subject: [PATCH 689/718] Avoid linking both libgomp and libomp in mixed clang/gfortran builds --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 143fd0597..9ae0cea79 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(${name} omp pthread) + endif() #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From 16b488cabe8c6113ba41bf140f4dfe3cace6b9ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:38:05 +0100 Subject: [PATCH 690/718] CI: Add various Apple M1 build configurations to gh workflow --- .github/workflows/apple_m.yml | 149 ++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 .github/workflows/apple_m.yml diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml new file mode 100644 index 000000000..e34eada86 --- /dev/null +++ b/.github/workflows/apple_m.yml @@ -0,0 +1,149 @@ +name: apple m + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: macos-14 + + strategy: + fail-fast: false + matrix: + build: [cmake, make] + fortran: [gfortran] + openmp: [0, 1] + ilp64: [0, 1] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache libtinfo5 + elif [ "$RUNNER_OS" == "macOS" ]; then + # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. + brew reinstall gcc + brew install coreutils cmake ccache + brew install llvm + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + # GNU make and cmake call the compilers differently. It looks like + # that causes the cache to mismatch. Keep the ccache for both build + # tools separate to avoid polluting each other. + key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }} + # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler. + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }} + ccache-${{ runner.os }}-${{ matrix.build }} + + - name: Configure ccache + run: | + if [ "${{ matrix.build }}" = "make" ]; then + # Add ccache to path + if [ "$RUNNER_OS" = "Linux" ]; then + echo "/usr/lib/ccache" >> $GITHUB_PATH + elif [ "$RUNNER_OS" = "macOS" ]; then + echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH + echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH + echo "" >>$GITHUB_PATH + else + echo "::error::$RUNNER_OS not supported" + exit 1 + fi + fi + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Build OpenBLAS + run: | + export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" + export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + export CC="/opt/homebrew/opt/llvm/bin/clang" + case "${{ matrix.build }}" in + "make") + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}" + ;; + "cmake") + export LDFLAGS="$LDFLAGS -Wl,-ld_classic" + mkdir build && cd build + cmake -DDYNAMIC_ARCH=1 \ + -DUSE_OPENMP=${{matrix.openmp}} \ + -DINTERFACE64=${{matrix.ilp64}} \ + -DNOFORTRAN=0 \ + -DBUILD_WITHOUT_LAPACK=0 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ + .. + cmake --build . + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac + + - name: Show ccache status + continue-on-error: true + run: ccache -s + + - name: Run tests + timeout-minutes: 60 + run: | + case "${{ matrix.build }}" in + "make") + MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0' + echo "::group::Tests in 'test' directory" + make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'ctest' directory" + make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + echo "::group::Tests in 'utest' directory" + make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}" + echo "::endgroup::" + ;; + "cmake") + cd build && ctest + ;; + *) + echo "::error::Configuration not supported" + exit 1 + ;; + esac From 5b953f2f8d3a3f138e4be4ed28624ee25826c968 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Feb 2024 22:41:08 +0100 Subject: [PATCH 691/718] Disable most AppleM1 builds (replaced by gh workflows) --- .cirrus.yml | 70 ++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index b4c4870d0..9a898f421 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,44 +1,44 @@ macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest -task: - name: AppleM1/LLVM - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - make TARGET=VORTEX USE_OPENMP=1 CC=clang +#task: +# name: AppleM1/LLVM +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang -task: - name: AppleM1/LLVM/ILP64 - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 +#task: +# name: AppleM1/LLVM/ILP64 +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 -task: - name: AppleM1/LLVM/CMAKE - compile_script: - - brew install llvm - - export PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - mkdir build - - cd build - - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. - - make -j 4 +#task: +# name: AppleM1/LLVM/CMAKE +# compile_script: +# - brew install llvm +# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" +# - mkdir build +# - cd build +# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. +# - make -j 4 -task: - name: AppleM1/GCC/MAKE/OPENMP - compile_script: - - brew install gcc@11 - - export PATH=/opt/homebrew/bin:$PATH - - export LDFLAGS="-L/opt/homebrew/lib" - - export CPPFLAGS="-I/opt/homebrew/include" - - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 +#task: +# name: AppleM1/GCC/MAKE/OPENMP +# compile_script: +# - brew install gcc@11 +# - export PATH=/opt/homebrew/bin:$PATH +# - export LDFLAGS="-L/opt/homebrew/lib" +# - export CPPFLAGS="-I/opt/homebrew/include" +# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest From bdaa6705ca7b0031223417402e3f87e85fef86ac Mon Sep 17 00:00:00 2001 From: frjohnst Date: Fri, 23 Feb 2024 10:20:48 -0500 Subject: [PATCH 692/718] fix conlict between PR 4515 and AIX shared obj support --- exports/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 15f65aefe..27a291f34 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -315,6 +315,11 @@ test : linktest.c linktest.c : $(GENSYM) ../Makefile.system ../getarch.c ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c +ifeq ($(F_COMPILER), IBM) + mv linktest.c linktest.c.FIRST + egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c + rm linktest.c.FIRST +endif clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed From be5e18c6f94716c6265f9e2cd58517fa138abcf5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Feb 2024 23:55:43 +0100 Subject: [PATCH 693/718] Add kernel definitions for CSUM and ZSUM --- kernel/x86_64/KERNEL | 2 ++ kernel/x86_64/KERNEL.SKYLAKEX | 2 ++ 2 files changed, 4 insertions(+) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index f8278c3b4..ec4290e82 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -489,5 +489,7 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S SSUMKERNEL = ../arm/sum.c DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = zsum_sse.S +ZSUMKERNEL = zsum_sse2.S SOMATCOPY_RT = omatcopy_rt.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 548e5dcfc..7e946ef2e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -46,3 +46,5 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c +CSUMKERNEL = csum.c +ZSUMKERNEL = zsum.c From 8f8ef3492a3eb753f8e6e1201dfe4c7107c5d779 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Feb 2024 23:57:50 +0100 Subject: [PATCH 694/718] Add CSUM and ZSUM kernels (trivially derived from their existing ASUM counterparts) --- kernel/x86_64/csum.c | 131 +++++++++++ kernel/x86_64/csum_microk_skylakex-2.c | 289 ++++++++++++++++++++++++ kernel/x86_64/zsum.c | 131 +++++++++++ kernel/x86_64/zsum_microk_skylakex-2.c | 280 +++++++++++++++++++++++ kernel/x86_64/zsum_sse.S | 299 +++++++++++++++++++++++++ kernel/x86_64/zsum_sse2.S | 283 +++++++++++++++++++++++ 6 files changed, 1413 insertions(+) create mode 100644 kernel/x86_64/csum.c create mode 100644 kernel/x86_64/csum_microk_skylakex-2.c create mode 100644 kernel/x86_64/zsum.c create mode 100644 kernel/x86_64/zsum_microk_skylakex-2.c create mode 100644 kernel/x86_64/zsum_sse.S create mode 100644 kernel/x86_64/zsum_sse2.S diff --git a/kernel/x86_64/csum.c b/kernel/x86_64/csum.c new file mode 100644 index 000000000..e85b5cae1 --- /dev/null +++ b/kernel/x86_64/csum.c @@ -0,0 +1,131 @@ +#include "common.h" + +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) +#include "csum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_CSUM_KERNEL +static FLOAT csum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + sum0 += x1[0]; + sum1 += x1[1]; + sum2 += x1[2]; + sum3 += x1[3]; + + sum0 += x1[4]; + sum1 += x1[5]; + sum2 += x1[6]; + sum3 += x1[7]; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += (x1[0] + x1[1]); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = csum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int sum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = sum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = sum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = sum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/csum_microk_skylakex-2.c b/kernel/x86_64/csum_microk_skylakex-2.c new file mode 100644 index 000000000..ec882efa1 --- /dev/null +++ b/kernel/x86_64/csum_microk_skylakex-2.c @@ -0,0 +1,289 @@ +/* need a new enough GCC for avx512 support */ +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) + +#define HAVE_CASUM_KERNEL 1 + +#include + +#include + +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + if (n2 < 64) { + __m128 accum_10, accum_11, accum_12, accum_13; + + accum_10 = _mm_setzero_ps(); + accum_11 = _mm_setzero_ps(); + accum_12 = _mm_setzero_ps(); + accum_13 = _mm_setzero_ps(); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + + if (n2 >= 32){ + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + _mm_prefetch(&x1[16], _MM_HINT_T0); + __m128 x04 = _mm_loadu_ps(&x1[16]); + __m128 x05 = _mm_loadu_ps(&x1[20]); + __m128 x06 = _mm_loadu_ps(&x1[24]); + __m128 x07 = _mm_loadu_ps(&x1[28]); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + accum_10 = _mm_add_ps(accum_10, x04); + accum_11 = _mm_add_ps(accum_11, x05); + accum_12 = _mm_add_ps(accum_12, x06); + accum_13 = _mm_add_ps(accum_13, x07); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (x1[0] + x1[1]); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; + __m512 x00, x01, x02, x03, x04, x05, x06, x07; + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif +#endif diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c new file mode 100644 index 000000000..5973c1253 --- /dev/null +++ b/kernel/x86_64/zsum.c @@ -0,0 +1,131 @@ +#include "common.h" + +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) +#include "zsum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_ZASUM_KERNEL +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + sum0 += x1[0]; + sum1 += x1[1]; + sum2 += x1[2]; + sum3 += x1[3]; + + sum0 += x1[4]; + sum1 += x1[5]; + sum2 += x1[6]; + sum3 += x1[7]; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += x1[0] + x1[1]; + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = zsum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += x[ip] + x[ip + 1]; + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int sum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = sum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = sum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = sum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/zsum_microk_skylakex-2.c b/kernel/x86_64/zsum_microk_skylakex-2.c new file mode 100644 index 000000000..0bca7ce6d --- /dev/null +++ b/kernel/x86_64/zsum_microk_skylakex-2.c @@ -0,0 +1,280 @@ +/* need a new enough GCC for avx512 support */ +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203)) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203)) + +#define HAVE_ZSUM_KERNEL 1 + +#include + +#include + +static FLOAT zsum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + + if (n2 < 32) { + __m128d accum_10, accum_11, accum_12, accum_13; + + accum_10 = _mm_setzero_pd(); + accum_11 = _mm_setzero_pd(); + accum_12 = _mm_setzero_pd(); + accum_13 = _mm_setzero_pd(); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + if (n2 >= 16){ + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + _mm_prefetch(&x1[8], _MM_HINT_T0); + __m128d x04 = _mm_loadu_pd(&x1[ 8]); + __m128d x05 = _mm_loadu_pd(&x1[10]); + __m128d x06 = _mm_loadu_pd(&x1[12]); + __m128d x07 = _mm_loadu_pd(&x1[14]); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + accum_10 = _mm_add_pd(accum_10, x04); + accum_11 = _mm_add_pd(accum_11, x05); + accum_12 = _mm_add_pd(accum_12, x06); + accum_13 = _mm_add_pd(accum_13, x07); + + x1 += 16; + n2 -= 16; + } + + if (n2 >= 8) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + accum_10 = _mm_add_pd(accum_10, x00); + } + + accum_10 = _mm_add_pd(accum_10, accum_11); + accum_12 = _mm_add_pd(accum_12, accum_13); + accum_10 = _mm_add_pd(accum_10, accum_12); + + accum_10 = _mm_hadd_pd(accum_10, accum_10); + + sumf = accum_10[0]; + } + else { + __m512d accum_0, accum_1, accum_2, accum_3; + __m512d x00, x01, x02, x03, x04, x05, x06, x07; + __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff); + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 128) { + if (n2 >= 64) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x04 = _mm512_loadu_pd(&x1[32]); + x05 = _mm512_loadu_pd(&x1[40]); + x06 = _mm512_loadu_pd(&x1[48]); + x07 = _mm512_loadu_pd(&x1[56]); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_loadu_pd(&x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + } + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + // n2 >= 128, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (0 != align_header) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x04 = _mm512_load_pd(&x1[32]); + x05 = _mm512_load_pd(&x1[40]); + x06 = _mm512_load_pd(&x1[48]); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + + while (n2 >= 64) { + accum_0 = _mm512_add_pd(accum_0, x00); + x00 = _mm512_load_pd(&x1[ 0]); + accum_1 = _mm512_add_pd(accum_1, x01); + x01 = _mm512_load_pd(&x1[ 8]); + accum_2 = _mm512_add_pd(accum_2, x02); + x02 = _mm512_load_pd(&x1[16]); + accum_3 = _mm512_add_pd(accum_3, x03); + x03 = _mm512_load_pd(&x1[24]); + + accum_0 = _mm512_add_pd(accum_0, x04); + x04 = _mm512_load_pd(&x1[32]); + accum_1 = _mm512_add_pd(accum_1, x05); + x05 = _mm512_load_pd(&x1[40]); + accum_2 = _mm512_add_pd(accum_2, x06); + x06 = _mm512_load_pd(&x1[48]); + accum_3 = _mm512_add_pd(accum_3, x07); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + } + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + if (n2 >= 32) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_load_pd(&x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + accum_0 = _mm512_add_pd(accum_0, x00); + } + + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + } + + return sumf; +} +#endif +#endif diff --git a/kernel/x86_64/zsum_sse.S b/kernel/x86_64/zsum_sse.S new file mode 100644 index 000000000..b679b42b0 --- /dev/null +++ b/kernel/x86_64/zsum_sse.S @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + addq M, M + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm1 + + addps %xmm10, %xmm2 + addps %xmm11, %xmm3 + + addq $32 * SIZE, X + ALIGN_3 + +.L14: + testq $31, M + jle .L998 + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $2, I + jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + + addps %xmm5, %xmm1 + + decq I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $3, M + jle .L998 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zsum_sse2.S b/kernel/x86_64/zsum_sse2.S new file mode 100644 index 000000000..6f667164d --- /dev/null +++ b/kernel/x86_64/zsum_sse2.S @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + addq M, M + + testq $SIZE, X + je .L05 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetcht0 PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE From 12787775d92c700d9f5956ea35cbcb0576e4ab2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 17:55:36 +0100 Subject: [PATCH 695/718] add csum/zsum kernels (trivially derived from the asum ones)s) --- kernel/arm64/KERNEL.ARMV8SVE | 3 + kernel/arm64/csum_thunderx2t99.c | 247 +++++++++++++++++++++++++++++++ kernel/arm64/zsum_thunderx2t99.c | 244 ++++++++++++++++++++++++++++++ 3 files changed, 494 insertions(+) create mode 100644 kernel/arm64/csum_thunderx2t99.c create mode 100644 kernel/arm64/zsum_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index ccbce27e1..eeb4844bf 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -1,3 +1,6 @@ + +CSUMKERNEL = csum_thunderx2t99.c +ZSUMKERNEL = zsum_thunderx2t99.c SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c diff --git a/kernel/arm64/csum_thunderx2t99.c b/kernel/arm64/csum_thunderx2t99.c new file mode 100644 index 000000000..874f4eb5a --- /dev/null +++ b/kernel/arm64/csum_thunderx2t99.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include + +#define N "x0" /* vector length */ +#define X "x1" /* "X" vector address */ +#define INC_X "x2" /* "X" stride */ +#define J "x5" /* loop variable */ + +#define REG0 "wzr" +#define SUMF "s0" +#define SUMFD "d0" + +/******************************************************************************/ + +#define KERNEL_F1 \ + "ldr d1, ["X"] \n" \ + "add "X", "X", #8 \n" \ + "ext v2.8b, v1.8b, v1.8b, #4 \n" \ + "fadd s1, s1, s2 \n" \ + "fadd "SUMF", "SUMF", s1 \n" + +#define KERNEL_F32 \ + "ldr q16, ["X"] \n" \ + "ldr q17, ["X", #16] \n" \ + "ldr q18, ["X", #32] \n" \ + "ldr q19, ["X", #48] \n" \ + "ldp q20, q21, ["X", #64] \n" \ + "ldp q22, q23, ["X", #96] \n" \ + "ldp q24, q25, ["X", #128] \n" \ + "ldp q26, q27, ["X", #160] \n" \ + "fadd v16.4s, v16.4s, v17.4s \n" \ + "fadd v18.4s, v18.4s, v19.4s \n" \ + "ldp q28, q29, ["X", #192] \n" \ + "ldp q30, q31, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fadd v20.4s, v20.4s, v21.4s \n" \ + "fadd v22.4s, v22.4s, v23.4s \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "fadd v24.4s, v24.4s, v25.4s \n" \ + "fadd v26.4s, v26.4s, v27.4s \n" \ + "fadd v0.4s, v0.4s, v16.4s \n" \ + "fadd v1.4s, v1.4s, v18.4s \n" \ + "fadd v2.4s, v2.4s, v20.4s \n" \ + "fadd v3.4s, v3.4s, v22.4s \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fadd v28.4s, v28.4s, v29.4s \n" \ + "fadd v30.4s, v30.4s, v31.4s \n" \ + "fadd v4.4s, v4.4s, v24.4s \n" \ + "fadd v5.4s, v5.4s, v26.4s \n" \ + "fadd v6.4s, v6.4s, v28.4s \n" \ + "fadd v7.4s, v7.4s, v30.4s \n" + +#define KERNEL_F32_FINALIZE \ + "fadd v0.4s, v0.4s, v1.4s \n" \ + "fadd v2.4s, v2.4s, v3.4s \n" \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v6.4s, v6.4s, v7.4s \n" \ + "fadd v0.4s, v0.4s, v2.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" \ + "fadd v0.4s, v0.4s, v4.4s \n" \ + "ext v1.16b, v0.16b, v0.16b, #8 \n" \ + "fadd v0.2s, v0.2s, v1.2s \n" \ + "faddp "SUMF", v0.2s \n" + +#define INIT_S \ + "lsl "INC_X", "INC_X", #3 \n" + +#define KERNEL_S1 \ + "ldr d1, ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "ext v2.8b, v1.8b, v1.8b, #4 \n" \ + "fadd s1, s1, s2 \n" \ + "fadd "SUMF", "SUMF", s1 \n" + + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + +static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asum = 0.0 ; + + if ( n < 0 ) return(asum); + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + " fmov "SUMF", "REG0" \n" + " fmov s1, "REG0" \n" + " fmov s2, "REG0" \n" + " fmov s3, "REG0" \n" + " fmov s4, "REG0" \n" + " fmov s5, "REG0" \n" + " fmov s6, "REG0" \n" + " fmov s7, "REG0" \n" + " cmp "N", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", #1 \n" + " bne 5f //asum_kernel_S_BEGIN \n" + + "1: //asum_kernel_F_BEGIN: \n" + " asr "J", "N", #5 \n" + " cmp "J", xzr \n" + " beq 3f //asum_kernel_F1 \n" + + "2: //asum_kernel_F32: \n" + " "KERNEL_F32" \n" + " subs "J", "J", #1 \n" + " bne 2b //asum_kernel_F32 \n" + " "KERNEL_F32_FINALIZE" \n" + + "3: //asum_kernel_F1: \n" + " ands "J", "N", #31 \n" + " ble 9f //asum_kernel_L999 \n" + + "4: //asum_kernel_F10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" + + "5: //asum_kernel_S_BEGIN: \n" + " "INIT_S" \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble 7f //asum_kernel_S1 \n" + + "6: //asum_kernel_S4: \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 6b //asum_kernel_S4 \n" + + "7: //asum_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble 9f //asum_kernel_L999 \n" + + "8: //asum_kernel_S10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 8b //asum_kernel_S10 \n" + + "9: //asum_kernel_L999: \n" + " fmov %[ASUM_], "SUMFD" \n" + + : [ASUM_] "=r" (asum) //%0 + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x) //%3 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + + return asum; +} + +#if defined(SMP) +static int casum_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *result = casum_compute(n, x, inc_x); + + return 0; +} +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT asum = 0.0; + +#if defined(SMP) + if (inc_x == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + asum = casum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + FLOAT *ptr; + + mode = BLAS_SINGLE | BLAS_COMPLEX; + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, NULL, 0, result, 0, + ( void *)casum_thread_function, nthreads); + + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + asum = asum + (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + asum = casum_compute(n, x, inc_x); +#endif + + return asum; +} diff --git a/kernel/arm64/zsum_thunderx2t99.c b/kernel/arm64/zsum_thunderx2t99.c new file mode 100644 index 000000000..087dae2fe --- /dev/null +++ b/kernel/arm64/zsum_thunderx2t99.c @@ -0,0 +1,244 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include + +#define N "x0" /* vector length */ +#define X "x1" /* "X" vector address */ +#define INC_X "x2" /* "X" stride */ +#define J "x5" /* loop variable */ + +#define REG0 "xzr" +#define SUMF "d0" +#define TMPF "d1" + +/******************************************************************************/ + +#define KERNEL_F1 \ + "ldr q1, ["X"] \n" \ + "add "X", "X", #16 \n" \ + "faddp d1, v1.2d \n" \ + "fadd "SUMF", "SUMF", d1 \n" + +#define KERNEL_F16 \ + "ldr q16, ["X"] \n" \ + "ldr q17, ["X", #16] \n" \ + "ldr q18, ["X", #32] \n" \ + "ldr q19, ["X", #48] \n" \ + "ldp q20, q21, ["X", #64] \n" \ + "ldp q22, q23, ["X", #96] \n" \ + "ldp q24, q25, ["X", #128] \n" \ + "ldp q26, q27, ["X", #160] \n" \ + "fadd v16.2d, v16.2d, v17.2d \n" \ + "fadd v18.2d, v18.2d, v19.2d \n" \ + "ldp q28, q29, ["X", #192] \n" \ + "ldp q30, q31, ["X", #224] \n" \ + "add "X", "X", #256 \n" \ + "fadd v20.2d, v20.2d, v21.2d \n" \ + "fadd v22.2d, v22.2d, v23.2d \n" \ + "PRFM PLDL1KEEP, ["X", #1024] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+64] \n" \ + "fadd v24.2d, v24.2d, v25.2d \n" \ + "fadd v26.2d, v26.2d, v27.2d \n" \ + "fadd v28.2d, v28.2d, v29.2d \n" \ + "fadd v30.2d, v30.2d, v31.2d \n" \ + "fadd v0.2d, v0.2d, v16.2d \n" \ + "fadd v1.2d, v1.2d, v18.2d \n" \ + "fadd v2.2d, v2.2d, v20.2d \n" \ + "fadd v3.2d, v3.2d, v22.2d \n" \ + "PRFM PLDL1KEEP, ["X", #1024+128] \n" \ + "PRFM PLDL1KEEP, ["X", #1024+192] \n" \ + "fadd v4.2d, v4.2d, v24.2d \n" \ + "fadd v5.2d, v5.2d, v26.2d \n" \ + "fadd v6.2d, v6.2d, v28.2d \n" \ + "fadd v7.2d, v7.2d, v30.2d \n" + +#define KERNEL_F16_FINALIZE \ + "fadd v0.2d, v0.2d, v1.2d \n" \ + "fadd v2.2d, v2.2d, v3.2d \n" \ + "fadd v4.2d, v4.2d, v5.2d \n" \ + "fadd v6.2d, v6.2d, v7.2d \n" \ + "fadd v0.2d, v0.2d, v2.2d \n" \ + "fadd v4.2d, v4.2d, v6.2d \n" \ + "fadd v0.2d, v0.2d, v4.2d \n" \ + "faddp "SUMF", v0.2d \n" + +#define INIT_S \ + "lsl "INC_X", "INC_X", #4 \n" + +#define KERNEL_S1 \ + "ldr q1, ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "faddp d1, v1.2d \n" \ + "fadd "SUMF", "SUMF", d1 \n" + + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + +static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT asum = 0.0 ; + + if ( n < 0 ) return(asum); + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + " fmov "SUMF", "REG0" \n" + " fmov d1, "REG0" \n" + " fmov d2, "REG0" \n" + " fmov d3, "REG0" \n" + " fmov d4, "REG0" \n" + " fmov d5, "REG0" \n" + " fmov d6, "REG0" \n" + " fmov d7, "REG0" \n" + " cmp "N", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", xzr \n" + " ble 9f //asum_kernel_L999 \n" + " cmp "INC_X", #1 \n" + " bne 5f //asum_kernel_S_BEGIN \n" + + "1: //asum_kernel_F_BEGIN: \n" + " asr "J", "N", #4 \n" + " cmp "J", xzr \n" + " beq 3f //asum_kernel_F1 \n" + + ".align 5 \n" + "2: //asum_kernel_F16: \n" + " "KERNEL_F16" \n" + " subs "J", "J", #1 \n" + " bne 2b //asum_kernel_F16 \n" + " "KERNEL_F16_FINALIZE" \n" + + "3: //asum_kernel_F1: \n" + " ands "J", "N", #15 \n" + " ble 9f //asum_kernel_L999 \n" + + "4: //asum_kernel_F10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 4b //asum_kernel_F10 \n" + " b 9f //asum_kernel_L999 \n" + + "5: //asum_kernel_S_BEGIN: \n" + " "INIT_S" \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble 7f //asum_kernel_S1 \n" + + "6: //asum_kernel_S4: \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 6b //asum_kernel_S4 \n" + + "7: //asum_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble 9f //asum_kernel_L999 \n" + + "8: //asum_kernel_S10: \n" + " "KERNEL_S1" \n" + " subs "J", "J", #1 \n" + " bne 8b //asum_kernel_S10 \n" + + "9: //asum_kernel_L999: \n" + " fmov %[ASUM_], "SUMF" \n" + + : [ASUM_] "=r" (asum) //%0 + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x) //%3 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + + return asum; +} + +#if defined(SMP) +static int zasum_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *result = zasum_compute(n, x, inc_x); + + return 0; +} +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT asum = 0.0; + +#if defined(SMP) + if (inc_x == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + asum = zasum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + FLOAT *ptr; + + mode = BLAS_DOUBLE | BLAS_COMPLEX; + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, NULL, 0, result, 0, + ( void *)zasum_thread_function, nthreads); + + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + asum = asum + (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + asum = zasum_compute(n, x, inc_x); +#endif + + return asum; +} From 7d506984fa2c1d1f8dd981969d84788d087907f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 17:57:11 +0100 Subject: [PATCH 696/718] fix assignment of default CSUM kernel --- kernel/arm64/KERNEL | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index f936cdf47..7d7e648c4 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -1,3 +1,5 @@ +CSUMKERNEL=csum.S + ifndef SNRM2KERNEL SNRM2KERNEL = ../arm/nrm2.c endif From baf88564bcf857f088839158d07ec05380ed4c7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 19:23:41 +0100 Subject: [PATCH 697/718] Fix potential buffer overflow --- interface/gemmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index 018deb7fb..cae00877e 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -522,7 +522,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - buffer_size = j + k + 128 / sizeof(FLOAT); + buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); #endif @@ -611,7 +611,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - buffer_size = j + k + 128 / sizeof(FLOAT); + buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); #endif From 0ce794f0c363754e0a9c7c3b2d997e9fe7465c12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 23:38:36 +0100 Subject: [PATCH 698/718] Enable GEMM3M tests on supported platforms --- ctest/CMakeLists.txt | 46 +++++++++++++++++++++++++++++++++++++++++++- ctest/Makefile | 36 ++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 6e0a7f309..d7baadee4 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -10,6 +10,11 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") endif() +set (USE_GEMM3M 0) +if (${ARCH} MATCHES x86|x86_64|ia64|mips) + set(USE_GEMM3M 1) +endif () + if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 "$ErrorActionPreference = \"Stop\"\n" @@ -88,6 +93,17 @@ if (NOT NOFORTRAN) auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3_3m.f + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() else() add_executable(x${float_char}cblat3 c_${float_char}blat3c.c @@ -96,6 +112,17 @@ else() auxiliary.c c_xerbla.c constant.c) + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_executable(x${float_char}cblat3_3m + c_${float_char}blat3c_3m.c + c_${float_char}blas3_3m.c + c_${float_char}3chke_3m.c + auxiliary.c + c_xerbla.c + constant.c) + endif() + endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) @@ -105,7 +132,24 @@ endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) + if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) + string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") + target_link_libraries(x${float_char}cblat3 omp pthread) + endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") + target_link_libraries(x${float_char}cblat3_3m m) + endif() + endif() + endif() add_test(NAME "x${float_char}cblat3" COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") - + if (USE_GEMM3M) + if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) + add_test(NAME "x${float_char}cblat3_3m" + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m") + endif() + endif() endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index ad960b35a..36682c7b6 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,6 +5,24 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize @@ -43,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o -all :: all1 all2 all3 +all :: all1 all2 all3 all3_3m ifeq ($(BUILD_SINGLE),1) all1targets += xscblat1 @@ -182,8 +200,9 @@ endif endif all3_3m: xzcblat3_3m xccblat3_3m +ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_SINGLE),1) +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif ifeq ($(BUILD_COMPLEX16),1) @@ -197,6 +216,7 @@ ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif +endif @@ -271,8 +291,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -280,6 +302,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif @@ -293,8 +319,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(SUPPORT_GEMM3M),1) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif else xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) @@ -302,6 +330,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) +ifeq ($(SUPPORT_GEMM3M),1) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME) + $(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif endif endif From ba201c1939fbb2e68ccb384709ee29e9e4559158 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Feb 2024 23:39:24 +0100 Subject: [PATCH 699/718] Enable GEMM3M tests on supported platforms --- test/CMakeLists.txt | 19 ++++++++++++++++++- test/Makefile | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ace20dffc..b4bf36cee 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,7 +21,18 @@ endif() if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() -message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) + +set (USE_GEMM3M 0) +if (${ARCH} MATCHES x86|x86_64|ia64|mips) + set(USE_GEMM3M 1) + if (BUILD_COMPLEX) + list (APPEND OpenBLAS_Tests cblat3_3m) + endif () + if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat3_3m) + endif () +endif () + foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) @@ -82,4 +93,10 @@ add_test(NAME "${float_type}blas2" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) +if (USE_GEMM3M) +if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z")) +add_test(NAME "${float_type}blas3_3m" + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM) +endif() +endif() endforeach() diff --git a/test/Makefile b/test/Makefile index 5a4694ce6..6a50b6c98 100644 --- a/test/Makefile +++ b/test/Makefile @@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN) override FFLAGS += -fno-tree-vectorize endif +SUPPORT_GEMM3M = 0 + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + ifeq ($(NOFORTRAN),1) all :: else @@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1) D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) +ifeq ($(SUPPORT_GEMM3M),1) +C3=cblat3 cblat3_3m +else C3=cblat3 endif +endif ifeq ($(BUILD_COMPLEX16),1) +ifeq ($(SUPPORT_GEMM3M),1) +Z3=zblat3 zblat3_3m +else Z3=zblat3 endif +endif + level3: $(B3) $(S3) $(D3) $(C3) $(Z3) From 87dd1c710e5bec07f72b0e30d8c81db3446c456d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 07:37:30 +0100 Subject: [PATCH 700/718] fix conditional gemm3m build --- ctest/Makefile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 36682c7b6..3a3ee5611 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -61,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o -all :: all1 all2 all3 all3_3m +all :: all1 all2 all3 ifeq ($(BUILD_SINGLE),1) all1targets += xscblat1 @@ -162,9 +162,15 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 +ifeq ($(USE_GEMM3M),1) +all3targets += xccblat3_3m +endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 +ifeq ($(USE_GEMM3M),1) +all3targets += xzcblat3_3m +endif endif all3: $(all3targets) @@ -199,7 +205,6 @@ endif endif endif -all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(SUPPORT_GEMM3M),1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_COMPLEX),1) From 8dea25ffffca03dee25db035e7197f83402b117f Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Feb 2024 02:04:37 -0500 Subject: [PATCH 701/718] LoongArch64: Fixed utest kernel_regress:skx_avx --- kernel/loongarch64/rot_lasx.S | 2 +- kernel/loongarch64/rot_lsx.S | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S index 5d7e3d7cc..71378e0b2 100644 --- a/kernel/loongarch64/rot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -1036,7 +1036,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY xvfmul.d VT0, VX1, VXC xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX0, VXS + xvfmul.d VT1, VX1, VXS xvfmsub.d VT1, VX3, VXC, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S index 4b0e59310..3bb77aaec 100644 --- a/kernel/loongarch64/rot_lsx.S +++ b/kernel/loongarch64/rot_lsx.S @@ -1142,7 +1142,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE vinsgr2vr.d VX0, t1, 0 vinsgr2vr.d VX0, t2, 1 - add.d X, X, INCX ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -1199,7 +1198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY VMUL VT0, VX1, VXC VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX0, VXS + VMUL VT1, VX1, VXS VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX @@ -1223,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY VMUL VT0, VX1, VXC VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX0, VXS + VMUL VT1, VX1, VXS VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX @@ -1296,7 +1295,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY VMUL VT0, VX1, VXC VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX0, VXS + VMUL VT1, VX1, VXS VMSUB VT1, VX3, VXC, VT1 vstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX From 5aaeca2896464a21e8c7f435bb28b5a44378c49e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 09:26:14 +0100 Subject: [PATCH 702/718] fix name --- ctest/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 3a3ee5611..bbaf96f8e 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -162,13 +162,13 @@ all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) all3targets += xccblat3 -ifeq ($(USE_GEMM3M),1) +ifeq ($(SUPPORT_GEMM3M),1) all3targets += xccblat3_3m endif endif ifeq ($(BUILD_COMPLEX16),1) all3targets += xzcblat3 -ifeq ($(USE_GEMM3M),1) +ifeq ($(SUPPORT_GEMM3M),1) all3targets += xzcblat3_3m endif endif From ea167328f112bddc82eb5ab8ab9c330b4e210c36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 14:14:58 +0100 Subject: [PATCH 703/718] Add f2c-converted sources for GEMM3M tests --- ctest/c_cblat3c_3m.c | 4854 +++++++++++++++++++++++++++++++++++++++++ ctest/c_zblat3c_3m.c | 4897 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 9751 insertions(+) create mode 100644 ctest/c_cblat3c_3m.c create mode 100644 ctest/c_zblat3c_3m.c diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c new file mode 100644 index 000000000..9cfa26a41 --- /dev/null +++ b/ctest/c_cblat3c_3m.c @@ -0,0 +1,4854 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0; + if (trace) { + o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = 0; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1); + } +/* Read the flag that directs rewinding of the snapshot file. */ + s_rsle(&io___7); + do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); + e_rsle(); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + s_rsle(&io___9); + do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether error exits are to be tested. */ + s_rsle(&io___11); + do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether row-major data layout to be tested. */ + s_rsle(&io___13); + do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); + e_rsle(); +/* Read the threshold value of the test ratio */ + s_rsle(&io___15); + do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real)); + e_rsle(); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + s_rsle(&io___17); + do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); + e_rsle(); + if (nidim < 1 || nidim > 9) { + s_wsfe(&io___19); + do_fio(&c__1, "N", (ftnlen)1); + do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___20); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_rsle(); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + s_wsfe(&io___23); + do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + s_rsle(&io___24); + do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); + e_rsle(); + if (nalf < 1 || nalf > 7) { + s_wsfe(&io___26); + do_fio(&c__1, "ALPHA", (ftnlen)5); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___27); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); + } + e_rsle(); +/* Values of BETA */ + s_rsle(&io___29); + do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); + e_rsle(); + if (nbet < 1 || nbet > 7) { + s_wsfe(&io___31); + do_fio(&c__1, "BETA", (ftnlen)4); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___32); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex)); + } + e_rsle(); + +/* Report values of parameters. */ + + s_wsfe(&io___34); + e_wsfe(); + s_wsfe(&io___35); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_wsfe(); + s_wsfe(&io___36); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real)); + } + e_wsfe(); + s_wsfe(&io___37); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real)); + } + e_wsfe(); + if (! tsterr) { + s_wsle(&io___38); + e_wsle(); + s_wsfe(&io___39); + e_wsfe(); + } + s_wsle(&io___40); + e_wsle(); + s_wsfe(&io___41); + do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real)); + e_wsfe(); + s_wsle(&io___42); + e_wsle(); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + s_wsfe(&io___45); + e_wsfe(); + } else if (layout == 1) { + rorder = TRUE_; + s_wsfe(&io___46); + e_wsfe(); + } else if (layout == 0) { + corder = TRUE_; + s_wsfe(&io___47); + e_wsfe(); + } + s_wsle(&io___48); + e_wsle(); + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + i__1 = s_rsfe(&io___50); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, snamet, (ftnlen)13); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); + if (i__1 != 0) { + goto L60; + } + i__1 = e_rsfe(); + if (i__1 != 0) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + 0) { + goto L50; + } +/* L40: */ + } + s_wsfe(&io___53); + do_fio(&c__1, snamet, (ftnlen)13); + e_wsfe(); + s_stop("", (ftnlen)0); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: + cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1); + +/* Compute EPS (the machine precision). */ + + eps = 1.f; +L70: + r__1 = eps + 1.f; + if (sdiff_(&r__1, &c_b91) == 0.f) { + goto L80; + } + eps *= .5f; + goto L70; +L80: + eps += eps; + s_wsfe(&io___55); + do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real)); + e_wsfe(); + +/* Check the reliability of CMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (real) i__4, ab[i__3].i = 0.f; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (real) j, ab[i__2].i = 0.f; + i__2 = j - 1; + c__[i__2].r = 0.f, c__[i__2].i = 0.f; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L110: */ + } +/* CC holds the exact result. On exit from CMMCH CT holds */ +/* the result computed by CMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___68); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___69); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (real) i__3, ab[i__2].i = 0.f; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (real) i__3, cc[i__2].i = 0.f; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___70); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lce_(cc, ct, &n); + if (! same || err != 0.f) { + s_wsfe(&io___71); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + s_wsle(&io___73); + e_wsle(); + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + s_wsfe(&io___74); + do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); + e_wsfe(); + } else { + s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( + ftnlen)13); +/* Test error exits. */ + if (tsterr) { + cc3chke_(snames + (isnum - 1) * 13); + s_wsle(&io___75); + e_wsle(); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test CGEMM, 01. */ +L140: + if (corder) { + cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHEMM, 02, CSYMM, 03. */ +L150: + if (corder) { + cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CTRMM, 04, CTRSM, 05. */ +L160: + if (corder) { + cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test CHERK, 06, CSYRK, 07. */ +L170: + if (corder) { + cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test CHER2K, 08, CSYR2K, 09. */ +L180: + if (corder) { + cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + s_wsfe(&io___82); + e_wsfe(); + goto L230; + +L210: + s_wsfe(&io___83); + e_wsfe(); + goto L230; + +L220: + s_wsfe(&io___84); + e_wsfe(); + +L230: + if (trace) { + cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1); + } + cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0); + + +/* End of CBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T" + "AKEN ON VALID\002,\002 CALL ******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13], trana, tranb; + integer nargs; + logical reset; + extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, complex *, + integer *, integer *, complex *, integer *); + integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + char tranas[1], tranbs[1], transa[1], transb[1]; + real errmax; + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als, bls; + real err; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer * + , integer *, integer *, complex *, complex *, integer *, complex * + , integer *, complex *, complex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccgemm3m_(iorder, transa, transb, &m, &n, &k, + &alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___128.ciunit = *nout; + s_wsfe(&io___128); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lce_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lce_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lceres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___131.ciunit = *nout; + s_wsfe(&io___131); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + cmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___133.ciunit = *nout; + s_wsfe(&io___133); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___134.ciunit = *nout; + s_wsfe(&io___134); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___135.ciunit = *nout; + s_wsfe(&io___135); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___136.ciunit = *nout; + s_wsfe(&io___136); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L130; + +L120: + io___137.ciunit = *nout; + s_wsfe(&io___137); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* L9995: */ + +/* End of CCHK1. */ + +} /* cchk1_ */ + + +/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer + *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" + ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; + + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + /* Fortran I/O blocks */ + static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___141.ciunit = *nout; + s_wsfe(&io___141); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cta, (ftnlen)14); + do_fio(&c__1, ctb, (ftnlen)14); + e_wsfe(); + io___142.ciunit = *nout; + s_wsfe(&io___142); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn1_ */ + + +/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *); + integer ia, ib, na, nc, im, in; + extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + integer ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + real errmax; + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als, bls; + integer icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHEMM and CSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + cmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + cprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + if (conj) { + cchemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + ccsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___181.ciunit = *nout; + s_wsfe(&io___181); + e_wsfe(); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___184.ciunit = *nout; + s_wsfe(&io___184); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + cmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + cmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___186.ciunit = *nout; + s_wsfe(&io___186); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___187.ciunit = *nout; + s_wsfe(&io___187); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___188.ciunit = *nout; + s_wsfe(&io___188); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___189.ciunit = *nout; + s_wsfe(&io___189); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L120; + +L110: + io___190.ciunit = *nout; + s_wsfe(&io___190); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* L9995: */ + +/* End of CCHK2. */ + +} /* cchk2_ */ + + +/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," + "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; + + /* Local variables */ + char cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___194.ciunit = *nout; + s_wsfe(&io___194); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___195.ciunit = *nout; + s_wsfe(&io___195); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn2_ */ + + +/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, + complex *bs, complex *ct, real *g, complex *c__, integer *iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + alist al__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + char diags[1]; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + logical isame[13]; + char sides[1]; + integer nargs; + logical reset; + char uplos[1]; + extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, complex * + , integer *, integer *); + integer ia, na, nc, im, in, ms, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + char tranas[1], transa[1]; + extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, + char *, integer *, integer *, complex *, complex *, integer *, + complex *, integer *); + real errmax; + integer laa, icd, lbb, lda, ldb; + extern logical lce_(complex *, complex *, integer *); + integer ics; + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CTRMM and CTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.f; +/* Set up zero matrix for CMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0.f, c__[i__3].i = 0.f; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + cmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + cmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cctrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cctrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___236.ciunit = *nout; + s_wsfe(&io___236); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lce_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lce_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lceres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___239.ciunit = *nout; + s_wsfe(&io___239); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + cmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + q__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = q__1.r, bb[i__6].i = q__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + cmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + cmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___241.ciunit = *nout; + s_wsfe(&io___241); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___242.ciunit = *nout; + s_wsfe(&io___242); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___243.ciunit = *nout; + s_wsfe(&io___243); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___244.ciunit = *nout; + s_wsfe(&io___244); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L160; + +L150: + io___245.ciunit = *nout; + s_wsfe(&io___245); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (*trace) { + cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* L9995: */ + +/* End of CCHK3. */ + +} /* cchk3_ */ + + +/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, complex *alpha, integer *lda, integer *ldb) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 " + "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." + "\002)"; + + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___251.ciunit = *nout; + s_wsfe(&io___251); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___252.ciunit = *nout; + s_wsfe(&io___252); + do_fio(&c__1, ca, (ftnlen)14); + do_fio(&c__1, cd, (ftnlen)14); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn3_ */ + + +/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex * + as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, + complex *cs, complex *ct, real *g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + complex q__1; + alist al__1; + + /* Local variables */ + complex beta; + integer ldas, ldcs; + logical same, conj; + complex bets; + real rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + complex *, integer *), cprcn6_(integer *, + integer *, char *, integer *, char *, char *, integer *, integer * + , real *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks; + extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, + integer *, real *, complex *, integer *, real *, complex *, + integer *); + integer ns; + real ralpha; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, complex *, + integer *); + char transs[1], transt[1]; + integer laa, lda, lcc, ldc; + extern logical lce_(complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHERK and CSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + q__1.r = ralpha, q__1.i = 0.f; + alpha.r = q__1.r, alpha.i = q__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || ralpha == 0.f) && + rbeta == 1.f; + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + cprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___294.ciunit = *nout; + s_wsfe(&io___294); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lceres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___297.ciunit = *nout; + s_wsfe(&io___297); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + cmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + cmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___304.ciunit = *nout; + s_wsfe(&io___304); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___305.ciunit = *nout; + s_wsfe(&io___305); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___306.ciunit = *nout; + s_wsfe(&io___306); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___307.ciunit = *nout; + s_wsfe(&io___307); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L130; + +L110: + if (n > 1) { + io___308.ciunit = *nout; + s_wsfe(&io___308); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L120: + io___309.ciunit = *nout; + s_wsfe(&io___309); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK4. */ + +} /* cchk4_ */ + + +/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" + ",\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___313.ciunit = *nout; + s_wsfe(&io___313); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___314.ciunit = *nout; + s_wsfe(&io___314); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn4_ */ + + + +/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, real * + alpha, integer *lda, real *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" + ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___318.ciunit = *nout; + s_wsfe(&io___318); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___319.ciunit = *nout; + s_wsfe(&io___319); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn6_ */ + + +/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer * + nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, + integer *nidim, integer *idim, integer *nalf, complex *alf, integer * + nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex * + as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, + complex *ct, real *g, complex *w, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + complex q__1, q__2; + alist al__1; + + /* Local variables */ + integer jjab; + complex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + complex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, logical *, + complex *); + complex alpha; + extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *, complex *, real *, complex *, + integer *, real *, real *, logical *, integer *, logical *); + real rbeta; + logical isame[13]; + integer nargs; + real rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, complex *, integer *, + integer *, complex *, integer *), cprcn7_( + integer *, integer *, char *, integer *, char *, char *, integer * + , integer *, complex *, integer *, integer *, real *, integer *); + integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns; + extern logical lceres_(char *, char *, integer *, integer *, complex *, + complex *, integer *); + real errmax; + char transs[1], transt[1]; + extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + real *, complex *, integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + extern logical lce_(complex *, complex *, integer *); + extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + complex als; + integer ict, icu; + real err; + + /* Fortran I/O blocks */ + static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests CHER2K and CSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.f; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + q__1.r = rbeta, q__1.i = 0.f; + beta.r = q__1.r, beta.i = q__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || alpha.r == 0.f && + alpha.i == 0.f) && rbeta == 1.f; + } + +/* Generate the matrix C. */ + + cmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + cprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + cprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___362.ciunit = *nout; + s_wsfe(&io___362); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lce_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lce_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lce_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lceres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___365.ciunit = *nout; + s_wsfe(&io___365); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = (j - 1 << 1) * *nmax + k + + i__; + q__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + q__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = + q__1.i; + if (conj) { + i__7 = k + i__; + r_cnjg(&q__2, &alpha); + i__8 = (j - 1 << 1) * *nmax + i__; + q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, + q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ + i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = k + i__; + i__8 = (j - 1 << 1) * *nmax + i__; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + cmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]); + q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, + q__1.i = alpha.r * q__2.i + alpha.i * + q__2.r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + r_cnjg(&q__1, &q__2); + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, q__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = q__1.r, w[i__7].i = q__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + cmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___373.ciunit = *nout; + s_wsfe(&io___373); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___374.ciunit = *nout; + s_wsfe(&io___374); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___375.ciunit = *nout; + s_wsfe(&io___375); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + if (*iorder == 1) { + io___376.ciunit = *nout; + s_wsfe(&io___376); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); + e_wsfe(); + } + } + goto L160; + +L140: + if (n > 1) { + io___377.ciunit = *nout; + s_wsfe(&io___377); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L150: + io___378.ciunit = *nout; + s_wsfe(&io___378); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK5. */ + +} /* cchk5_ */ + + +/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" + ",f4.1,\002), C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___382.ciunit = *nout; + s_wsfe(&io___382); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___383.ciunit = *nout; + s_wsfe(&io___383); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn5_ */ + + + +/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, complex * + alpha, integer *lda, integer *ldb, real *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," + "\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___387.ciunit = *nout; + s_wsfe(&io___387); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___388.ciunit = *nout; + s_wsfe(&io___388); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* cprcn7_ */ + + +/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, complex *a, integer *nmax, complex *aa, integer *lda, + logical *reset, complex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + real r__1; + complex q__1, q__2; + + /* Local variables */ + extern /* Complex */ VOID cbeg_(complex *, logical *); + integer ibeg, iend; + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || upper && i__ <= j || lower && i__ >= j) { + i__3 = i__ + j * a_dim1; + cbeg_(&q__2, reset); + q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + if (her) { + i__3 = j + i__ * a_dim1; + r_cnjg(&q__1, &a[i__ + j * a_dim1]); + a[i__3].r = q__1.r, a[i__3].i = q__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0.f, a[i__3].i = 0.f; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + r__1 = a[i__3].r; + q__1.r = r__1, q__1.i = 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10f, aa[i__3].i = 1e10f; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + r__1 = aa[i__3].r; + q__1.r = r__1, q__1.i = -1e10f; + aa[i__2].r = q__1.r, aa[i__2].i = q__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of CMAKE. */ + +} /* cmake_ */ + +/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, + integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, + real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * + fatal, integer *nout, logical *mv) +{ + /* Format strings */ + static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" + " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " + " EXPECTED RE\002,\002SULT COMPUTED R" + "ESULT\002)"; + static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," + "\002)\002))"; + static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + real r__1, r__2, r__3, r__4, r__5, r__6; + complex q__1, q__2, q__3, q__4; + + /* Local variables */ + real erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + + /* Fortran I/O blocks */ + static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; + static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; + + + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0.f, ct[i__3].i = 0.f; + g[i__] = 0.f; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag( + &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[ + i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * + b_dim1]), abs(r__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6] + .r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[k + j * b_dim1]), abs(r__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, q__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * (( + r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag( + &b[j + k * b_dim1]), abs(r__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + r_cnjg(&q__4, &b[j + k * b_dim1]); + q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, + q__2.i = q__3.r * q__4.i + q__3.i * + q__4.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + r_cnjg(&q__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, + q__2.i = q__3.r * b[i__6].i + q__3.i * b[ + i__6].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + r_cnjg(&q__3, &b[j + k * b_dim1]); + q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, + q__2.i = a[i__6].r * q__3.i + a[i__6].i * + q__3.r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, q__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + + q__2.i; + ct[i__4].r = q__1.r, ct[i__4].i = q__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = + r_imag(&a[k + i__ * a_dim1]), abs(r__2))) + * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 + = r_imag(&b[j + k * b_dim1]), abs(r__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + ct[i__3].r = q__1.r, ct[i__3].i = q__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), + abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + ( + r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, + abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs( + r__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.f; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4] + .i; + q__1.r = q__2.r, q__1.i = q__2.i; + erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs( + r__2))) / *eps; + if (g[i__] != 0.f) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.f) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + io___409.ciunit = *nout; + s_wsfe(&io___409); + e_wsfe(); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + io___410.ciunit = *nout; + s_wsfe(&io___410); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) + ); + e_wsfe(); + } else { + io___411.ciunit = *nout; + s_wsfe(&io___411); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) + ); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); + e_wsfe(); + } +/* L240: */ + } + if (*n > 1) { + io___412.ciunit = *nout; + s_wsfe(&io___412); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L250: + return 0; + + +/* End of CMMCH. */ + +} /* cmmch_ */ + +logical lce_(complex *ri, complex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LCE. */ + +} /* lce_ */ + +logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, + complex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* L60: */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LCERES. */ + +} /* lceres_ */ + +/* Complex */ VOID cbeg_(complex * ret_val, logical *reset) +{ + /* System generated locals */ + real r__1, r__2; + complex q__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + r__1 = (i__ - 500) / 1001.f; + r__2 = (j - 500) / 1001.f; + q__1.r = r__1, q__1.i = r__2; + ret_val->r = q__1.r, ret_val->i = q__1.i; + return ; + +/* End of CBEG. */ + +} /* cbeg_ */ + +real sdiff_(real *x, real *y) +{ + /* System generated locals */ + real ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of SDIFF. */ + +} /* sdiff_ */ + +/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; } diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c new file mode 100644 index 000000000..059daccb5 --- /dev/null +++ b/ctest/c_zblat3c_3m.c @@ -0,0 +1,4897 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i= 0; + if (trace) { + o__1.oerr = 0; + o__1.ounit = ntra; + o__1.ofnmlen = 32; + o__1.ofnm = snaps; + o__1.orl = 0; + o__1.osta = "NEW"; + o__1.oacc = 0; + o__1.ofm = 0; + o__1.oblnk = 0; + f_open(&o__1); + } +/* Read the flag that directs rewinding of the snapshot file. */ + s_rsle(&io___7); + do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); + e_rsle(); + rewi = rewi && trace; +/* Read the flag that directs stopping on any failure. */ + s_rsle(&io___9); + do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether error exits are to be tested. */ + s_rsle(&io___11); + do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); + e_rsle(); +/* Read the flag that indicates whether row-major data layout to be tested. */ + s_rsle(&io___13); + do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); + e_rsle(); +/* Read the threshold value of the test ratio */ + s_rsle(&io___15); + do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); + e_rsle(); + +/* Read and check the parameter values for the tests. */ + +/* Values of N */ + s_rsle(&io___17); + do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); + e_rsle(); + if (nidim < 1 || nidim > 9) { + s_wsfe(&io___19); + do_fio(&c__1, "N", (ftnlen)1); + do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___20); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_rsle(); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + s_wsfe(&io___23); + do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } +/* L10: */ + } +/* Values of ALPHA */ + s_rsle(&io___24); + do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); + e_rsle(); + if (nalf < 1 || nalf > 7) { + s_wsfe(&io___26); + do_fio(&c__1, "ALPHA", (ftnlen)5); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___27); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof( + doublecomplex)); + } + e_rsle(); +/* Values of BETA */ + s_rsle(&io___29); + do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); + e_rsle(); + if (nbet < 1 || nbet > 7) { + s_wsfe(&io___31); + do_fio(&c__1, "BETA", (ftnlen)4); + do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); + e_wsfe(); + goto L220; + } + s_rsle(&io___32); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof( + doublecomplex)); + } + e_rsle(); + +/* Report values of parameters. */ + + s_wsfe(&io___34); + e_wsfe(); + s_wsfe(&io___35); + i__1 = nidim; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); + } + e_wsfe(); + s_wsfe(&io___36); + i__1 = nalf; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal)); + } + e_wsfe(); + s_wsfe(&io___37); + i__1 = nbet; + for (i__ = 1; i__ <= i__1; ++i__) { + do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal)); + } + e_wsfe(); + if (! tsterr) { + s_wsle(&io___38); + e_wsle(); + s_wsfe(&io___39); + e_wsfe(); + } + s_wsle(&io___40); + e_wsle(); + s_wsfe(&io___41); + do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_wsle(&io___42); + e_wsle(); + rorder = FALSE_; + corder = FALSE_; + if (layout == 2) { + rorder = TRUE_; + corder = TRUE_; + s_wsfe(&io___45); + e_wsfe(); + } else if (layout == 1) { + rorder = TRUE_; + s_wsfe(&io___46); + e_wsfe(); + } else if (layout == 0) { + corder = TRUE_; + s_wsfe(&io___47); + e_wsfe(); + } + s_wsle(&io___48); + e_wsle(); + +/* Read names of subroutines and flags which indicate */ +/* whether they are to be tested. */ + + for (i__ = 1; i__ <= 9; ++i__) { + ltest[i__ - 1] = FALSE_; +/* L20: */ + } +L30: + i__1 = s_rsfe(&io___50); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, snamet, (ftnlen)13); + if (i__1 != 0) { + goto L60; + } + i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); + if (i__1 != 0) { + goto L60; + } + i__1 = e_rsfe(); + if (i__1 != 0) { + goto L60; + } + for (i__ = 1; i__ <= 9; ++i__) { + if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + 0) { + goto L50; + } +/* L40: */ + } + s_wsfe(&io___53); + do_fio(&c__1, snamet, (ftnlen)13); + e_wsfe(); + s_stop("", (ftnlen)0); +L50: + ltest[i__ - 1] = ltestt; + goto L30; + +L60: + cl__1.cerr = 0; + cl__1.cunit = 5; + cl__1.csta = 0; + f_clos(&cl__1); + +/* Compute EPS (the machine precision). */ + + eps = 1.; +L70: + d__1 = eps + 1.; + if (ddiff_(&d__1, &c_b92) == 0.) { + goto L80; + } + eps *= .5; + goto L70; +L80: + eps += eps; + s_wsfe(&io___55); + do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal)); + e_wsfe(); + +/* Check the reliability of ZMMCH using exact data. */ + + n = 32; + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * 65 - 66; +/* Computing MAX */ + i__5 = i__ - j + 1; + i__4 = f2cmax(i__5,0); + ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.; +/* L90: */ + } + i__2 = j + 4224; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + ab[i__2].r = (doublereal) j, ab[i__2].i = 0.; + i__2 = j - 1; + c__[i__2].r = 0., c__[i__2].i = 0.; +/* L100: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j - 1; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L110: */ + } +/* CC holds the exact result. On exit from ZMMCH CT holds */ +/* the result computed by ZMMCH. */ + *(unsigned char *)transa = 'N'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___68); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___69); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = j + 4224; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; + i__2 = (j + 65) * 65 - 65; + i__3 = n - j + 1; + ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.; +/* L120: */ + } + i__1 = n; + for (j = 1; j <= i__1; ++j) { + i__2 = n - j; + i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3; + cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.; +/* L130: */ + } + *(unsigned char *)transa = 'C'; + *(unsigned char *)transb = 'N'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___70); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + *(unsigned char *)transb = 'C'; + zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & + c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, + &c__6, &c_true); + same = lze_(cc, ct, &n); + if (! same || err != 0.) { + s_wsfe(&io___71); + do_fio(&c__1, transa, (ftnlen)1); + do_fio(&c__1, transb, (ftnlen)1); + do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); + do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); + e_wsfe(); + s_stop("", (ftnlen)0); + } + +/* Test each subroutine in turn. */ + + for (isnum = 1; isnum <= 9; ++isnum) { + s_wsle(&io___73); + e_wsle(); + if (! ltest[isnum - 1]) { +/* Subprogram is not to be tested. */ + s_wsfe(&io___74); + do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); + e_wsfe(); + } else { + s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( + ftnlen)13); +/* Test error exits. */ + if (tsterr) { + cz3chke_(snames + (isnum - 1) * 13); + s_wsle(&io___75); + e_wsle(); + } +/* Test computations. */ + infoc_1.infot = 0; + infoc_1.ok = TRUE_; + fatal = FALSE_; + switch (isnum) { + case 1: goto L140; + case 2: goto L150; + case 3: goto L150; + case 4: goto L160; + case 5: goto L160; + case 6: goto L170; + case 7: goto L170; + case 8: goto L180; + case 9: goto L180; + } +/* Test ZGEMM, 01. */ +L140: + if (corder) { + zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZHEMM, 02, ZSYMM, 03. */ +L150: + if (corder) { + zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZTRMM, 04, ZTRSM, 05. */ +L160: + if (corder) { + zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__0); + } + if (rorder) { + zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & + c__1); + } + goto L190; +/* Test ZHERK, 06, ZSYRK, 07. */ +L170: + if (corder) { + zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__0); + } + if (rorder) { + zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, + cc, cs, ct, g, &c__1); + } + goto L190; +/* Test ZHER2K, 08, ZSYR2K, 09. */ +L180: + if (corder) { + zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__0); + } + if (rorder) { + zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & + nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, + ct, g, w, &c__1); + } + goto L190; + +L190: + if (fatal && sfatal) { + goto L210; + } + } +/* L200: */ + } + s_wsfe(&io___82); + e_wsfe(); + goto L230; + +L210: + s_wsfe(&io___83); + e_wsfe(); + goto L230; + +L220: + s_wsfe(&io___84); + e_wsfe(); + +L230: + if (trace) { + cl__1.cerr = 0; + cl__1.cunit = ntra; + cl__1.csta = 0; + f_clos(&cl__1); + } + cl__1.cerr = 0; + cl__1.cunit = 6; + cl__1.csta = 0; + f_clos(&cl__1); + s_stop("", (ftnlen)0); + + +/* End of ZBLAT3. */ + + return 0; +} /* MAIN__ */ + +/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char ich[3] = "NTC"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7, i__8; + alist al__1; + + /* Local variables */ + extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer * + , integer *, integer *, doublecomplex *, doublecomplex *, integer + *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same, null; + integer i__, k, m, n; + doublecomplex alpha; + logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, integer *, doublecomplex + *, integer *, integer *, doublecomplex *, integer *); + integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + char tranas[1], tranbs[1], transa[1], transb[1]; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + doublecomplex als, bls; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZGEMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 13; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; + + i__3 = *nidim; + for (ik = 1; ik <= i__3; ++ik) { + k = idim[ik]; + + for (ica = 1; ica <= 3; ++ica) { + *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1] + ; + trana = *(unsigned char *)transa == 'T' || *(unsigned + char *)transa == 'C'; + + if (trana) { + ma = k; + na = m; + } else { + ma = m; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ + 1], &lda, &reset, &c_b1); + + for (icb = 1; icb <= 3; ++icb) { + *(unsigned char *)transb = *(unsigned char *)&ich[icb + - 1]; + tranb = *(unsigned char *)transb == 'T' || *(unsigned + char *)transb == 'C'; + + if (tranb) { + mb = n; + nb = k; + } else { + mb = k; + nb = n; + } +/* Set LDB to 1 more than minimum value if room. */ + ldb = mb; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L70; + } + lbb = ldb * nb; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & + bb[1], &ldb, &reset, &c_b1); + + i__4 = *nalf; + for (ia = 1; ia <= i__4; ++ia) { + i__5 = ia; + alpha.r = alf[i__5].r, alpha.i = alf[i__5].i; + + i__5 = *nbet; + for (ib = 1; ib <= i__5; ++ib) { + i__6 = ib; + beta.r = bet[i__6].r, beta.i = bet[i__6].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)tranbs = *(unsigned char *) + transb; + ms = m; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__6 = laa; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + as[i__7].r = aa[i__8].r, as[i__7].i = aa[ + i__8].i; +/* L10: */ + } + ldas = lda; + i__6 = lbb; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[ + i__8].i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__6 = lcc; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = i__; + cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[ + i__8].i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn1_(ntra, &nc, sname, iorder, transa, + transb, &m, &n, &k, &alpha, &lda, + &ldb, &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czgemm3m_(iorder, transa, transb, &m, &n, &k, + &alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc); + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___128.ciunit = *nout; + s_wsfe(&io___128); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)transa == *( + unsigned char *)tranas; + isame[1] = *(unsigned char *)transb == *( + unsigned char *)tranbs; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = ks == k; + isame[5] = als.r == alpha.r && als.i == + alpha.i; + isame[6] = lze_(&as[1], &aa[1], &laa); + isame[7] = ldas == lda; + isame[8] = lze_(&bs[1], &bb[1], &lbb); + isame[9] = ldbs == ldb; + isame[10] = bls.r == beta.r && bls.i == + beta.i; + if (null) { + isame[11] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[11] = lzeres_("ge", " ", &m, &n, & + cs[1], &cc[1], &ldc); + } + isame[12] = ldcs == ldc; + +/* If data was incorrectly changed, report */ +/* and return. */ + + same = TRUE_; + i__6 = nargs; + for (i__ = 1; i__ <= i__6; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___131.ciunit = *nout; + s_wsfe(&io___131); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result. */ + + zmmch_(transa, transb, &m, &n, &k, &alpha, + &a[a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], + nmax, &ct[1], &g[1], &cc[1], &ldc, + eps, &err, fatal, nout, &c_true); + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L120; + } + } + +/* L50: */ + } + +/* L60: */ + } + +L70: + ; + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* L110: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___133.ciunit = *nout; + s_wsfe(&io___133); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___134.ciunit = *nout; + s_wsfe(&io___134); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___135.ciunit = *nout; + s_wsfe(&io___135); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___136.ciunit = *nout; + s_wsfe(&io___136); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L130; + +L120: + io___137.ciunit = *nout; + s_wsfe(&io___137); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & + lda, &ldb, &beta, &ldc); + +L130: + return 0; + +/* L9995: */ + +/* End of ZCHK1. */ + +} /* zchk1_ */ + + +/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer + *iorder, char *transa, char *transb, integer *m, integer *n, integer * + k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex * + beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" + ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; + + /* Local variables */ + char crc[14], cta[14], ctb[14]; + + /* Fortran I/O blocks */ + static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)transa == 'N') { + s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(cta, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transb == 'N') { + s_copy(ctb, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transb == 'T') { + s_copy(ctb, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___141.ciunit = *nout; + s_wsfe(&io___141); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cta, (ftnlen)14); + do_fio(&c__1, ctb, (ftnlen)14); + e_wsfe(); + io___142.ciunit = *nout; + s_wsfe(&io___142); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn1_ */ + + +/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char ichs[2] = "LR"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + alist al__1; + + /* Local variables */ + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same; + char side[1]; + logical conj, left, null; + char uplo[1]; + integer i__, m, n; + doublecomplex alpha; + logical isame[13]; + char sides[1]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + char uplos[1]; + integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, integer *, doublecomplex *, integer *); + integer na, nc, im, in, ms, ns; + extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer laa, lbb, lda, lcc, ldb, ldc, ics; + doublecomplex als, bls; + integer icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHEMM and ZSYMM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = m; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L90; + } + lcc = ldc * n; + null = n <= 0 || m <= 0; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L90; + } + lbb = ldb * n; + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & + reset, &c_b1); + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + +/* Generate the hermitian or symmetric matrix A. */ + + zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, + &aa[1], &lda, &reset, &c_b1); + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + +/* Generate the matrix C. */ + + zmake_("ge", " ", " ", &m, &n, &c__[c_offset], + nmax, &cc[1], &ldc, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *)side; + *(unsigned char *)uplos = *(unsigned char *)uplo; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + bls.r = beta.r, bls.i = beta.i; + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (*trace) { + zprcn2_(ntra, &nc, sname, iorder, side, uplo, + &m, &n, &alpha, &lda, &ldb, &beta, & + ldc) + ; + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + if (conj) { + czhemm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } else { + czsymm_(iorder, side, uplo, &m, &n, &alpha, & + aa[1], &lda, &bb[1], &ldb, &beta, &cc[ + 1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___181.ciunit = *nout; + s_wsfe(&io___181); + e_wsfe(); + *fatal = TRUE_; + goto L110; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *(unsigned + char *)side; + isame[1] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[2] = ms == m; + isame[3] = ns == n; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + isame[9] = bls.r == beta.r && bls.i == beta.i; + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], + &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___184.ciunit = *nout; + s_wsfe(&io___184); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L110; + } + + if (! null) { + +/* Check the result. */ + + if (left) { + zmmch_("N", "N", &m, &n, &m, &alpha, &a[ + a_offset], nmax, &b[b_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } else { + zmmch_("N", "N", &m, &n, &n, &alpha, &b[ + b_offset], nmax, &a[a_offset], + nmax, &beta, &c__[c_offset], nmax, + &ct[1], &g[1], &cc[1], &ldc, eps, + &err, fatal, nout, &c_true); + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +L90: + ; + } + +/* L100: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___186.ciunit = *nout; + s_wsfe(&io___186); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___187.ciunit = *nout; + s_wsfe(&io___187); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___188.ciunit = *nout; + s_wsfe(&io___188); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___189.ciunit = *nout; + s_wsfe(&io___189); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L120; + +L110: + io___190.ciunit = *nout; + s_wsfe(&io___190); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, + &beta, &ldc); + +L120: + return 0; + +/* L9995: */ + +/* End of ZCHK2. */ + +} /* zchk2_ */ + + +/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, integer *m, integer *n, + doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," + "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; + + /* Local variables */ + char cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___194.ciunit = *nout; + s_wsfe(&io___194); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___195.ciunit = *nout; + s_wsfe(&io___195); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn2_ */ + + +/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nmax, doublecomplex *a, doublecomplex *aa, + doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex + *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer * + iorder) +{ + /* Initialized data */ + + static char ichu[2] = "UL"; + static char icht[3] = "NTC"; + static char ichd[2] = "UN"; + static char ichs[2] = "LR"; + + /* Format strings */ + static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + alist al__1; + + /* Local variables */ + char diag[1]; + integer ldas, ldbs; + logical same; + char side[1]; + logical left, null; + char uplo[1]; + integer i__, j, m, n; + doublecomplex alpha; + char diags[1]; + logical isame[13]; + char sides[1]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + logical reset; + char uplos[1]; + integer ia, na; + extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer + *, char *, char *, char *, char *, integer *, integer *, + doublecomplex *, integer *, integer *); + integer nc, im, in, ms, ns; + char tranas[1], transa[1]; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, + char *, integer *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, + integer *, integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *); + integer laa, icd, lbb, lda, ldb, ics; + doublecomplex als; + integer ict, icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; + static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZTRMM and ZTRSM. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --g; + --ct; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + + nargs = 11; + nc = 0; + reset = TRUE_; + errmax = 0.; +/* Set up zero matrix for ZMMCH. */ + i__1 = *nmax; + for (j = 1; j <= i__1; ++j) { + i__2 = *nmax; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * c_dim1; + c__[i__3].r = 0., c__[i__3].i = 0.; +/* L10: */ + } +/* L20: */ + } + + i__1 = *nidim; + for (im = 1; im <= i__1; ++im) { + m = idim[im]; + + i__2 = *nidim; + for (in = 1; in <= i__2; ++in) { + n = idim[in]; +/* Set LDB to 1 more than minimum value if room. */ + ldb = m; + if (ldb < *nmax) { + ++ldb; + } +/* Skip tests if not enough room. */ + if (ldb > *nmax) { + goto L130; + } + lbb = ldb * n; + null = m <= 0 || n <= 0; + + for (ics = 1; ics <= 2; ++ics) { + *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; + left = *(unsigned char *)side == 'L'; + if (left) { + na = m; + } else { + na = n; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = na; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L130; + } + laa = lda * na; + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + + for (ict = 1; ict <= 3; ++ict) { + *(unsigned char *)transa = *(unsigned char *)&icht[ + ict - 1]; + + for (icd = 1; icd <= 2; ++icd) { + *(unsigned char *)diag = *(unsigned char *)&ichd[ + icd - 1]; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + +/* Generate the matrix A. */ + + zmake_("tr", uplo, diag, &na, &na, &a[ + a_offset], nmax, &aa[1], &lda, &reset, + &c_b1); + +/* Generate the matrix B. */ + + zmake_("ge", " ", " ", &m, &n, &b[b_offset], + nmax, &bb[1], &ldb, &reset, &c_b1); + + ++nc; + +/* Save every datum before calling the */ +/* subroutine. */ + + *(unsigned char *)sides = *(unsigned char *) + side; + *(unsigned char *)uplos = *(unsigned char *) + uplo; + *(unsigned char *)tranas = *(unsigned char *) + transa; + *(unsigned char *)diags = *(unsigned char *) + diag; + ms = m; + ns = n; + als.r = alpha.r, als.i = alpha.i; + i__4 = laa; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + as[i__5].r = aa[i__6].r, as[i__5].i = aa[ + i__6].i; +/* L30: */ + } + ldas = lda; + i__4 = lbb; + for (i__ = 1; i__ <= i__4; ++i__) { + i__5 = i__; + i__6 = i__; + bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[ + i__6].i; +/* L40: */ + } + ldbs = ldb; + +/* Call the subroutine. */ + + if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen) + 2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cztrmm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( + ftnlen)2) == 0) { + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, + side, uplo, transa, diag, &m, + &n, &alpha, &lda, &ldb, ( + ftnlen)13, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + cztrsm_(iorder, side, uplo, transa, diag, + &m, &n, &alpha, &aa[1], &lda, &bb[ + 1], &ldb); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___236.ciunit = *nout; + s_wsfe(&io___236); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)sides == *( + unsigned char *)side; + isame[1] = *(unsigned char *)uplos == *( + unsigned char *)uplo; + isame[2] = *(unsigned char *)tranas == *( + unsigned char *)transa; + isame[3] = *(unsigned char *)diags == *( + unsigned char *)diag; + isame[4] = ms == m; + isame[5] = ns == n; + isame[6] = als.r == alpha.r && als.i == + alpha.i; + isame[7] = lze_(&as[1], &aa[1], &laa); + isame[8] = ldas == lda; + if (null) { + isame[9] = lze_(&bs[1], &bb[1], &lbb); + } else { + isame[9] = lzeres_("ge", " ", &m, &n, &bs[ + 1], &bb[1], &ldb); + } + isame[10] = ldbs == ldb; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__4 = nargs; + for (i__ = 1; i__ <= i__4; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___239.ciunit = *nout; + s_wsfe(&io___239); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L50: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + if (s_cmp(sname + 9, "mm", (ftnlen)2, ( + ftnlen)2) == 0) { + +/* Check the result. */ + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + alpha, &a[a_offset], nmax, + &b[b_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); + } else { + zmmch_("N", transa, &m, &n, &n, & + alpha, &b[b_offset], nmax, + &a[a_offset], nmax, & + c_b1, &c__[c_offset], + nmax, &ct[1], &g[1], &bb[ + 1], &ldb, eps, &err, + fatal, nout, &c_true); + } + } else if (s_cmp(sname + 9, "sm", (ftnlen) + 2, (ftnlen)2) == 0) { + +/* Compute approximation to original */ +/* matrix. */ + + i__4 = n; + for (j = 1; j <= i__4; ++j) { + i__5 = m; + for (i__ = 1; i__ <= i__5; ++i__) + { + i__6 = i__ + j * c_dim1; + i__7 = i__ + (j - 1) * ldb; + c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i; + i__6 = i__ + (j - 1) * ldb; + i__7 = i__ + j * b_dim1; + z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, + z__1.i = alpha.r * b[i__7].i + alpha.i * b[ + i__7].r; + bb[i__6].r = z__1.r, bb[i__6].i = z__1.i; +/* L60: */ + } +/* L70: */ + } + + if (left) { + zmmch_(transa, "N", &m, &n, &m, & + c_b2, &a[a_offset], nmax, + &c__[c_offset], nmax, & + c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } else { + zmmch_("N", transa, &m, &n, &n, & + c_b2, &c__[c_offset], + nmax, &a[a_offset], nmax, + &c_b1, &b[b_offset], nmax, + &ct[1], &g[1], &bb[1], & + ldb, eps, &err, fatal, + nout, &c_false); + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L150; + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +/* L110: */ + } + +/* L120: */ + } + +L130: + ; + } + +/* L140: */ + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___241.ciunit = *nout; + s_wsfe(&io___241); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___242.ciunit = *nout; + s_wsfe(&io___242); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___243.ciunit = *nout; + s_wsfe(&io___243); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___244.ciunit = *nout; + s_wsfe(&io___244); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L160; + +L150: + io___245.ciunit = *nout; + s_wsfe(&io___245); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (*trace) { + zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & + alpha, &lda, &ldb); + } + +L160: + return 0; + +/* L9995: */ + +/* End of ZCHK3. */ + +} /* zchk3_ */ + + +/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer + *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, + integer *n, doublecomplex *alpha, integer *lda, integer *ldb) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," + "a14,\002,\002,a14,\002,\002)"; + static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 " + "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." + "\002)"; + + /* Local variables */ + char ca[14], cd[14], cs[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)side == 'L') { + s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cs, " CblasRight", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)diag == 'N') { + s_copy(cd, " CblasNonUnit", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cd, " CblasUnit", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___251.ciunit = *nout; + s_wsfe(&io___251); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cs, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + e_wsfe(); + io___252.ciunit = *nout; + s_wsfe(&io___252); + do_fio(&c__1, ca, (ftnlen)14); + do_fio(&c__1, cd, (ftnlen)14); + do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn3_ */ + + +/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, + doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, + doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * + g, integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4, i__5, i__6, i__7; + doublecomplex z__1; + alist al__1; + + /* Local variables */ + doublecomplex beta; + integer ldas, ldcs; + logical same, conj; + doublecomplex bets; + doublereal rals; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + doublecomplex alpha; + doublereal rbeta; + logical isame[13]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + doublereal rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, doublecomplex *, integer *); + integer nc; + extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *); + integer ik, in, jj, lj, ks, ns; + doublereal ralpha; + extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, + integer *, doublereal *, doublecomplex *, integer *, doublereal *, + doublecomplex *, integer *); + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, doublecomplex *, integer *); + integer laa, lda, lcc, ldc; + doublecomplex als; + integer ict, icu; + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHERK and ZSYRK. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + b_dim1 = *nmax; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --as; + --aa; + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 10; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L100; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L80; + } + laa = lda * na; + +/* Generate the matrix A. */ + + zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & + lda, &reset, &c_b1); + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + if (conj) { + ralpha = alpha.r; + z__1.r = ralpha, z__1.i = 0.; + alpha.r = z__1.r, alpha.i = z__1.i; + } + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || ralpha == 0.) && + rbeta == 1.; + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + if (conj) { + rals = ralpha; + } else { + als.r = alpha.r, als.i = alpha.i; + } + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L20: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + zprcn6_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &ralpha, &lda, & + rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czherk_(iorder, uplo, trans, &n, &k, &ralpha, + &aa[1], &lda, &rbeta, &cc[1], &ldc); + } else { + if (*trace) { + zprcn4_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & + aa[1], &lda, &beta, &cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___294.ciunit = *nout; + s_wsfe(&io___294); + e_wsfe(); + *fatal = TRUE_; + goto L120; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + if (conj) { + isame[4] = rals == ralpha; + } else { + isame[4] = als.r == alpha.r && als.i == + alpha.i; + } + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + if (conj) { + isame[7] = rbets == rbeta; + } else { + isame[7] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[8] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[8] = lzeres_(sname + 7, uplo, &n, &n, & + cs[1], &cc[1], &ldc); + } + isame[9] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___297.ciunit = *nout; + s_wsfe(&io___297); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L30: */ + } + if (! same) { + *fatal = TRUE_; + goto L120; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + zmmch_(transt, "N", &lj, &c__1, &k, & + alpha, &a[jj * a_dim1 + 1], + nmax, &a[j * a_dim1 + 1], + nmax, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + zmmch_("N", transt, &lj, &c__1, &k, & + alpha, &a[jj + a_dim1], nmax, + &a[j + a_dim1], nmax, &beta, & + c__[jj + j * c_dim1], nmax, & + ct[1], &g[1], &cc[jc], &ldc, + eps, &err, fatal, nout, & + c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L110; + } +/* L40: */ + } + } + +/* L50: */ + } + +/* L60: */ + } + +/* L70: */ + } + +L80: + ; + } + +/* L90: */ + } + +L100: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___304.ciunit = *nout; + s_wsfe(&io___304); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___305.ciunit = *nout; + s_wsfe(&io___305); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___306.ciunit = *nout; + s_wsfe(&io___306); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___307.ciunit = *nout; + s_wsfe(&io___307); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L130; + +L110: + if (n > 1) { + io___308.ciunit = *nout; + s_wsfe(&io___308); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L120: + io___309.ciunit = *nout; + s_wsfe(&io___309); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, + &rbeta, &ldc); + } else { + zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + beta, &ldc); + } + +L130: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of CCHK4. */ + +} /* zchk4_ */ + + +/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" + ",\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___313.ciunit = *nout; + s_wsfe(&io___313); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___314.ciunit = *nout; + s_wsfe(&io___314); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn4_ */ + + + +/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, doublereal + *alpha, integer *lda, doublereal *beta, integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" + ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___318.ciunit = *nout; + s_wsfe(&io___318); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___319.ciunit = *nout; + s_wsfe(&io___319); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn6_ */ + + +/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, + integer *nout, integer *ntra, logical *trace, logical *rewi, logical * + fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * + alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * + ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, + doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, + doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, + integer *iorder) +{ + /* Initialized data */ + + static char icht[2] = "NC"; + static char ichu[2] = "UL"; + + /* Format strings */ + static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " + "TAKEN ON VALID\002,\002 CALL *******\002)"; + static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" + " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; + static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" + "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; + static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" + "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " + " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" + "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" + "ECT *******\002)"; + static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" + "BER:\002)"; + + /* System generated locals */ + integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; + doublecomplex z__1, z__2; + alist al__1; + + /* Local variables */ + integer jjab; + doublecomplex beta; + integer ldas, ldbs, ldcs; + logical same, conj; + doublecomplex bets; + logical tran, null; + char uplo[1]; + integer i__, j, k, n; + doublecomplex alpha; + doublereal rbeta; + logical isame[13]; + extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + logical *, doublecomplex *); + integer nargs; + extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *, doublereal *, doublecomplex *, + integer *, doublereal *, doublereal *, logical *, integer *, + logical *); + doublereal rbets; + logical reset; + char trans[1]; + logical upper; + char uplos[1]; + integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer + *, char *, char *, integer *, integer *, doublecomplex *, integer + *, integer *, doublecomplex *, integer *), + zprcn7_(integer *, integer *, char *, integer *, char *, char *, + integer *, integer *, doublecomplex *, integer *, integer *, + doublereal *, integer *); + integer ik, in, jj, lj, ks, ns; + doublereal errmax; + extern logical lzeres_(char *, char *, integer *, integer *, + doublecomplex *, doublecomplex *, integer *); + char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, doublecomplex *, + integer *); + integer laa, lbb, lda, lcc, ldb, ldc; + doublecomplex als; + integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + doublereal err; + extern logical lze_(doublecomplex *, doublecomplex *, integer *); + + /* Fortran I/O blocks */ + static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; + static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; + static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; + static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; + static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; + static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; + + + +/* Tests ZHER2K and ZSYR2K. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --idim; + --alf; + --bet; + --w; + --g; + --ct; + --cs; + --cc; + c_dim1 = *nmax; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --bs; + --bb; + --as; + --aa; + --ab; + + /* Function Body */ + conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; + + nargs = 12; + nc = 0; + reset = TRUE_; + errmax = 0.; + + i__1 = *nidim; + for (in = 1; in <= i__1; ++in) { + n = idim[in]; +/* Set LDC to 1 more than minimum value if room. */ + ldc = n; + if (ldc < *nmax) { + ++ldc; + } +/* Skip tests if not enough room. */ + if (ldc > *nmax) { + goto L130; + } + lcc = ldc * n; + + i__2 = *nidim; + for (ik = 1; ik <= i__2; ++ik) { + k = idim[ik]; + + for (ict = 1; ict <= 2; ++ict) { + *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; + tran = *(unsigned char *)trans == 'C'; + if (tran && ! conj) { + *(unsigned char *)trans = 'T'; + } + if (tran) { + ma = k; + na = n; + } else { + ma = n; + na = k; + } +/* Set LDA to 1 more than minimum value if room. */ + lda = ma; + if (lda < *nmax) { + ++lda; + } +/* Skip tests if not enough room. */ + if (lda > *nmax) { + goto L110; + } + laa = lda * na; + +/* Generate the matrix A. */ + + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & + lda, &reset, &c_b1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & + lda, &reset, &c_b1); + } + +/* Generate the matrix B. */ + + ldb = lda; + lbb = laa; + if (tran) { + i__3 = *nmax << 1; + zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] + , &ldb, &reset, &c_b1); + } else { + zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, + &bb[1], &ldb, &reset, &c_b1); + } + + for (icu = 1; icu <= 2; ++icu) { + *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; + upper = *(unsigned char *)uplo == 'U'; + + i__3 = *nalf; + for (ia = 1; ia <= i__3; ++ia) { + i__4 = ia; + alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; + + i__4 = *nbet; + for (ib = 1; ib <= i__4; ++ib) { + i__5 = ib; + beta.r = bet[i__5].r, beta.i = bet[i__5].i; + if (conj) { + rbeta = beta.r; + z__1.r = rbeta, z__1.i = 0.; + beta.r = z__1.r, beta.i = z__1.i; + } + null = n <= 0; + if (conj) { + null = null || (k <= 0 || alpha.r == 0. && + alpha.i == 0.) && rbeta == 1.; + } + +/* Generate the matrix C. */ + + zmake_(sname + 7, uplo, " ", &n, &n, &c__[ + c_offset], nmax, &cc[1], &ldc, &reset, & + c_b1); + + ++nc; + +/* Save every datum before calling the subroutine. */ + + *(unsigned char *)uplos = *(unsigned char *)uplo; + *(unsigned char *)transs = *(unsigned char *) + trans; + ns = n; + ks = k; + als.r = alpha.r, als.i = alpha.i; + i__5 = laa; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7] + .i; +/* L10: */ + } + ldas = lda; + i__5 = lbb; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7] + .i; +/* L20: */ + } + ldbs = ldb; + if (conj) { + rbets = rbeta; + } else { + bets.r = beta.r, bets.i = beta.i; + } + i__5 = lcc; + for (i__ = 1; i__ <= i__5; ++i__) { + i__6 = i__; + i__7 = i__; + cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7] + .i; +/* L30: */ + } + ldcs = ldc; + +/* Call the subroutine. */ + + if (conj) { + if (*trace) { + zprcn7_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &rbeta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czher2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &rbeta, & + cc[1], &ldc); + } else { + if (*trace) { + zprcn5_(ntra, &nc, sname, iorder, uplo, + trans, &n, &k, &alpha, &lda, &ldb, + &beta, &ldc); + } + if (*rewi) { + al__1.aerr = 0; + al__1.aunit = *ntra; + f_rew(&al__1); + } + czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, + &aa[1], &lda, &bb[1], &ldb, &beta, & + cc[1], &ldc); + } + +/* Check if error-exit was taken incorrectly. */ + + if (! infoc_1.ok) { + io___362.ciunit = *nout; + s_wsfe(&io___362); + e_wsfe(); + *fatal = TRUE_; + goto L150; + } + +/* See what data changed inside subroutines. */ + + isame[0] = *(unsigned char *)uplos == *(unsigned + char *)uplo; + isame[1] = *(unsigned char *)transs == *(unsigned + char *)trans; + isame[2] = ns == n; + isame[3] = ks == k; + isame[4] = als.r == alpha.r && als.i == alpha.i; + isame[5] = lze_(&as[1], &aa[1], &laa); + isame[6] = ldas == lda; + isame[7] = lze_(&bs[1], &bb[1], &lbb); + isame[8] = ldbs == ldb; + if (conj) { + isame[9] = rbets == rbeta; + } else { + isame[9] = bets.r == beta.r && bets.i == + beta.i; + } + if (null) { + isame[10] = lze_(&cs[1], &cc[1], &lcc); + } else { + isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] + , &cc[1], &ldc); + } + isame[11] = ldcs == ldc; + +/* If data was incorrectly changed, report and */ +/* return. */ + + same = TRUE_; + i__5 = nargs; + for (i__ = 1; i__ <= i__5; ++i__) { + same = same && isame[i__ - 1]; + if (! isame[i__ - 1]) { + io___365.ciunit = *nout; + s_wsfe(&io___365); + do_fio(&c__1, (char *)&i__, (ftnlen) + sizeof(integer)); + e_wsfe(); + } +/* L40: */ + } + if (! same) { + *fatal = TRUE_; + goto L150; + } + + if (! null) { + +/* Check the result column by column. */ + + if (conj) { + *(unsigned char *)transt = 'C'; + } else { + *(unsigned char *)transt = 'T'; + } + jjab = 1; + jc = 1; + i__5 = n; + for (j = 1; j <= i__5; ++j) { + if (upper) { + jj = 1; + lj = j; + } else { + jj = j; + lj = n - j + 1; + } + if (tran) { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + i__7 = i__; + i__8 = (j - 1 << 1) * *nmax + k + + i__; + z__1.r = alpha.r * ab[i__8].r - + alpha.i * ab[i__8].i, + z__1.i = alpha.r * ab[ + i__8].i + alpha.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = + z__1.i; + if (conj) { + i__7 = k + i__; + d_cnjg(&z__2, &alpha); + i__8 = (j - 1 << 1) * *nmax + i__; + z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, + z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ + i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = k + i__; + i__8 = (j - 1 << 1) * *nmax + i__; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L50: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + i__8 = *nmax << 1; + zmmch_(transt, "N", &lj, &c__1, &i__6, + &c_b2, &ab[jjab], &i__7, &w[ + 1], &i__8, &beta, &c__[jj + j + * c_dim1], nmax, &ct[1], &g[1] + , &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } else { + i__6 = k; + for (i__ = 1; i__ <= i__6; ++i__) { + if (conj) { + i__7 = i__; + d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); + z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, + z__1.i = alpha.r * z__2.i + alpha.i * + z__2.r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__2.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + d_cnjg(&z__1, &z__2); + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } else { + i__7 = i__; + i__8 = (k + i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + i__7 = k + i__; + i__8 = (i__ - 1) * *nmax + j; + z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] + .i, z__1.i = alpha.r * ab[i__8].i + alpha.i + * ab[i__8].r; + w[i__7].r = z__1.r, w[i__7].i = z__1.i; + } +/* L60: */ + } + i__6 = k << 1; + i__7 = *nmax << 1; + zmmch_("N", "N", &lj, &c__1, &i__6, & + c_b2, &ab[jj], nmax, &w[1], & + i__7, &beta, &c__[jj + j * + c_dim1], nmax, &ct[1], &g[1], + &cc[jc], &ldc, eps, &err, + fatal, nout, &c_true); + } + if (upper) { + jc += ldc; + } else { + jc = jc + ldc + 1; + if (tran) { + jjab += *nmax << 1; + } + } + errmax = f2cmax(errmax,err); +/* If got really bad answer, report and */ +/* return. */ + if (*fatal) { + goto L140; + } +/* L70: */ + } + } + +/* L80: */ + } + +/* L90: */ + } + +/* L100: */ + } + +L110: + ; + } + +/* L120: */ + } + +L130: + ; + } + +/* Report result. */ + + if (errmax < *thresh) { + if (*iorder == 0) { + io___373.ciunit = *nout; + s_wsfe(&io___373); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + if (*iorder == 1) { + io___374.ciunit = *nout; + s_wsfe(&io___374); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + e_wsfe(); + } + } else { + if (*iorder == 0) { + io___375.ciunit = *nout; + s_wsfe(&io___375); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + if (*iorder == 1) { + io___376.ciunit = *nout; + s_wsfe(&io___376); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); + e_wsfe(); + } + } + goto L160; + +L140: + if (n > 1) { + io___377.ciunit = *nout; + s_wsfe(&io___377); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L150: + io___378.ciunit = *nout; + s_wsfe(&io___378); + do_fio(&c__1, sname, (ftnlen)13); + e_wsfe(); + if (conj) { + zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &rbeta, &ldc); + } else { + zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & + ldb, &beta, &ldc); + } + +L160: + return 0; + +/* L9994: */ +/* L9993: */ + +/* End of ZCHK5. */ + +} /* zchk5_ */ + + +/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" + ",f4.1,\002), C,\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___382.ciunit = *nout; + s_wsfe(&io___382); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___383.ciunit = *nout; + s_wsfe(&io___383); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn5_ */ + + + +/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer + *iorder, char *uplo, char *transa, integer *n, integer *k, + doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, + integer *ldc) +{ + /* Format strings */ + static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," + "\002))"; + static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" + ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," + "\002,i3,\002).\002)"; + + /* Local variables */ + char ca[14], cu[14], crc[14]; + + /* Fortran I/O blocks */ + static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; + static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; + + + if (*(unsigned char *)uplo == 'U') { + s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); + } else { + s_copy(cu, " CblasLower", (ftnlen)14, (ftnlen)14); + } + if (*(unsigned char *)transa == 'N') { + s_copy(ca, " CblasNoTrans", (ftnlen)14, (ftnlen)14); + } else if (*(unsigned char *)transa == 'T') { + s_copy(ca, " CblasTrans", (ftnlen)14, (ftnlen)14); + } else { + s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14); + } + if (*iorder == 1) { + s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14); + } else { + s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); + } + io___387.ciunit = *nout; + s_wsfe(&io___387); + do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); + do_fio(&c__1, sname, (ftnlen)13); + do_fio(&c__1, crc, (ftnlen)14); + do_fio(&c__1, cu, (ftnlen)14); + do_fio(&c__1, ca, (ftnlen)14); + e_wsfe(); + io___388.ciunit = *nout; + s_wsfe(&io___388); + do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); + do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); + do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); + e_wsfe(); + return 0; +} /* zprcn7_ */ + + +/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, + integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, + integer *lda, logical *reset, doublecomplex *transl) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + doublereal d__1; + doublecomplex z__1, z__2; + + /* Local variables */ + integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *); + logical unit; + integer i__, j; + logical lower, upper; + integer jj; + logical gen, her, tri, sym; + + +/* Generates values for an M by N matrix A. */ +/* Stores the values in the array AA in the data structure required */ +/* by the routine, with unwanted elements set to rogue value. */ + +/* TYPE is 'ge', 'he', 'sy' or 'tr'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *nmax; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + --aa; + + /* Function Body */ + gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0; + her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0; + sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0; + tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0; + upper = (her || sym || tri) && *(unsigned char *)uplo == 'U'; + lower = (her || sym || tri) && *(unsigned char *)uplo == 'L'; + unit = tri && *(unsigned char *)diag == 'U'; + +/* Generate data in array A. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + if (gen || upper && i__ <= j || lower && i__ >= j) { + i__3 = i__ + j * a_dim1; + zbeg_(&z__2, reset); + z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + if (i__ != j) { +/* Set some elements to zero */ + if (*n > 3 && j == *n / 2) { + i__3 = i__ + j * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + if (her) { + i__3 = j + i__ * a_dim1; + d_cnjg(&z__1, &a[i__ + j * a_dim1]); + a[i__3].r = z__1.r, a[i__3].i = z__1.i; + } else if (sym) { + i__3 = j + i__ * a_dim1; + i__4 = i__ + j * a_dim1; + a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i; + } else if (tri) { + i__3 = j + i__ * a_dim1; + a[i__3].r = 0., a[i__3].i = 0.; + } + } + } +/* L10: */ + } + if (her) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + d__1 = a[i__3].r; + z__1.r = d__1, z__1.i = 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (tri) { + i__2 = j + j * a_dim1; + i__3 = j + j * a_dim1; + z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + } + if (unit) { + i__2 = j + j * a_dim1; + a[i__2].r = 1., a[i__2].i = 0.; + } +/* L20: */ + } + +/* Store elements in array AS in data structure required by routine. */ + + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L30: */ + } + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L40: */ + } +/* L50: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen) + 2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + if (unit) { + iend = j - 1; + } else { + iend = j; + } + } else { + if (unit) { + ibeg = j + 1; + } else { + ibeg = j; + } + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L60: */ + } + i__2 = iend; + for (i__ = ibeg; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + i__4 = i__ + j * a_dim1; + aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i; +/* L70: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + (j - 1) * *lda; + aa[i__3].r = -1e10, aa[i__3].i = 1e10; +/* L80: */ + } + if (her) { + jj = j + (j - 1) * *lda; + i__2 = jj; + i__3 = jj; + d__1 = aa[i__3].r; + z__1.r = d__1, z__1.i = -1e10; + aa[i__2].r = z__1.r, aa[i__2].i = z__1.i; + } +/* L90: */ + } + } + return 0; + +/* End of ZMAKE. */ + +} /* zmake_ */ + +/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * + n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, + doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * + c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * + cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, + integer *nout, logical *mv) +{ + /* Format strings */ + static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" + " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " + " EXPECTED RE\002,\002SULT COMPUTED R" + "ESULT\002)"; + static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," + "\002)\002))"; + static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" + " \002,i3)"; + + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, + cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; + doublereal d__1, d__2, d__3, d__4, d__5, d__6; + doublecomplex z__1, z__2, z__3, z__4; + + /* Local variables */ + doublereal erri; + integer i__, j, k; + logical trana, tranb, ctrana, ctranb; + + /* Fortran I/O blocks */ + static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; + static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; + static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; + + + +/* Checks the results of the computational tests. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --ct; + --g; + cc_dim1 = *ldcc; + cc_offset = 1 + cc_dim1 * 1; + cc -= cc_offset; + + /* Function Body */ + trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == + 'C'; + tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == + 'C'; + ctrana = *(unsigned char *)transa == 'C'; + ctranb = *(unsigned char *)transb == 'C'; + +/* Compute expected result, one column at a time, in CT using data */ +/* in A, B and C. */ +/* Compute gauges in G. */ + + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + ct[i__3].r = 0., ct[i__3].i = 0.; + g[i__] = 0.; +/* L10: */ + } + if (! trana && ! tranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, + z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[ + i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag( + &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[ + i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * + b_dim1]), abs(d__4))); +/* L20: */ + } +/* L30: */ + } + } else if (trana && ! tranb) { + if (ctrana) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = k + j * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6] + .r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L40: */ + } +/* L50: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = k + j * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = k + j * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[k + j * b_dim1]), abs(d__4))); +/* L60: */ + } +/* L70: */ + } + } + } else if (! trana && tranb) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L80: */ + } +/* L90: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = i__ + k * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7] + .i, z__2.i = a[i__6].r * b[i__7].i + a[i__6] + .i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = i__ + k * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * (( + d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag( + &b[j + k * b_dim1]), abs(d__4))); +/* L100: */ + } +/* L110: */ + } + } + } else if (trana && tranb) { + if (ctrana) { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + d_cnjg(&z__4, &b[j + k * b_dim1]); + z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, + z__2.i = z__3.r * z__4.i + z__3.i * + z__4.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L120: */ + } +/* L130: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + d_cnjg(&z__3, &a[k + i__ * a_dim1]); + i__6 = j + k * b_dim1; + z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, + z__2.i = z__3.r * b[i__6].i + z__3.i * b[ + i__6].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L140: */ + } +/* L150: */ + } + } + } else { + if (ctranb) { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + d_cnjg(&z__3, &b[j + k * b_dim1]); + z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, + z__2.i = a[i__6].r * z__3.i + a[i__6].i * + z__3.r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L160: */ + } +/* L170: */ + } + } else { + i__2 = *kk; + for (k = 1; k <= i__2; ++k) { + i__3 = *m; + for (i__ = 1; i__ <= i__3; ++i__) { + i__4 = i__; + i__5 = i__; + i__6 = k + i__ * a_dim1; + i__7 = j + k * b_dim1; + z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[ + i__7].i, z__2.i = a[i__6].r * b[i__7].i + + a[i__6].i * b[i__7].r; + z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + + z__2.i; + ct[i__4].r = z__1.r, ct[i__4].i = z__1.i; + i__4 = k + i__ * a_dim1; + i__5 = j + k * b_dim1; + g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = + d_imag(&a[k + i__ * a_dim1]), abs(d__2))) + * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 + = d_imag(&b[j + k * b_dim1]), abs(d__4))); +/* L180: */ + } +/* L190: */ + } + } + } + } + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__; + z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = + alpha->r * ct[i__4].i + alpha->i * ct[i__4].r; + i__5 = i__ + j * c_dim1; + z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = + beta->r * c__[i__5].i + beta->i * c__[i__5].r; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + ct[i__3].r = z__1.r, ct[i__3].i = z__1.i; + i__3 = i__ + j * c_dim1; + g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), + abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + ( + d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, + abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs( + d__6))); +/* L200: */ + } + +/* Compute the error ratio for this result. */ + + *err = 0.; + i__2 = *m; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__; + i__4 = i__ + j * cc_dim1; + z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4] + .i; + z__1.r = z__2.r, z__1.i = z__2.i; + erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs( + d__2))) / *eps; + if (g[i__] != 0.) { + erri /= g[i__]; + } + *err = f2cmax(*err,erri); + if (*err * sqrt(*eps) >= 1.) { + goto L230; + } +/* L210: */ + } + +/* L220: */ + } + +/* If the loop completes, all results are at least half accurate. */ + goto L250; + +/* Report fatal error. */ + +L230: + *fatal = TRUE_; + io___409.ciunit = *nout; + s_wsfe(&io___409); + e_wsfe(); + i__1 = *m; + for (i__ = 1; i__ <= i__1; ++i__) { + if (*mv) { + io___410.ciunit = *nout; + s_wsfe(&io___410); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( + doublereal)); + e_wsfe(); + } else { + io___411.ciunit = *nout; + s_wsfe(&io___411); + do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); + do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( + doublereal)); + do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); + e_wsfe(); + } +/* L240: */ + } + if (*n > 1) { + io___412.ciunit = *nout; + s_wsfe(&io___412); + do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); + e_wsfe(); + } + +L250: + return 0; + + +/* End of ZMMCH. */ + +} /* zmmch_ */ + +logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) +{ + /* System generated locals */ + integer i__1, i__2, i__3; + logical ret_val; + + /* Local variables */ + integer i__; + + +/* Tests if two arrays are identical. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + --rj; + --ri; + + /* Function Body */ + i__1 = *lr; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = i__; + i__3 = i__; + if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) { + goto L20; + } +/* L10: */ + } + ret_val = TRUE_; + goto L30; +L20: + ret_val = FALSE_; +L30: + return ret_val; + +/* End of LZE. */ + +} /* lze_ */ + +logical lzeres_(char *type__, char *uplo, integer *m, integer *n, + doublecomplex *aa, doublecomplex *as, integer *lda) +{ + /* System generated locals */ + integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; + logical ret_val; + + /* Local variables */ + integer ibeg, iend, i__, j; + logical upper; + + +/* Tests if selected elements in two arrays are equal. */ + +/* TYPE is 'ge' or 'he' or 'sy'. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + /* Parameter adjustments */ + as_dim1 = *lda; + as_offset = 1 + as_dim1 * 1; + as -= as_offset; + aa_dim1 = *lda; + aa_offset = 1 + aa_dim1 * 1; + aa -= aa_offset; + + /* Function Body */ + upper = *(unsigned char *)uplo == 'U'; + if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + i__2 = *lda; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L10: */ + } +/* L20: */ + } + } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, + "sy", (ftnlen)2, (ftnlen)2) == 0) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + if (upper) { + ibeg = 1; + iend = j; + } else { + ibeg = j; + iend = *n; + } + i__2 = ibeg - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L30: */ + } + i__2 = *lda; + for (i__ = iend + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * aa_dim1; + i__4 = i__ + j * as_dim1; + if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) { + goto L70; + } +/* L40: */ + } +/* L50: */ + } + } + +/* L60: */ + ret_val = TRUE_; + goto L80; +L70: + ret_val = FALSE_; +L80: + return ret_val; + +/* End of LZERES. */ + +} /* lzeres_ */ + +/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset) +{ + /* System generated locals */ + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + static integer i__, j, ic, mi, mj; + + +/* Generates complex numbers as pairs of random numbers uniformly */ +/* distributed between -0.5 and 0.5. */ + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + if (*reset) { +/* Initialize local variables. */ + mi = 891; + mj = 457; + i__ = 7; + j = 7; + ic = 0; + *reset = FALSE_; + } + +/* The sequence of values of I or J is bounded between 1 and 999. */ +/* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */ +/* If initial I or J = 4 or 8, the period will be 25. */ +/* If initial I or J = 5, the period will be 10. */ +/* IC is used to break up the period by skipping 1 value of I or J */ +/* in 6. */ + + ++ic; +L10: + i__ *= mi; + j *= mj; + i__ -= i__ / 1000 * 1000; + j -= j / 1000 * 1000; + if (ic >= 5) { + ic = 0; + goto L10; + } + d__1 = (i__ - 500) / 1001.; + d__2 = (j - 500) / 1001.; + z__1.r = d__1, z__1.i = d__2; + ret_val->r = z__1.r, ret_val->i = z__1.i; + return ; + +/* End of ZBEG. */ + +} /* zbeg_ */ + +doublereal ddiff_(doublereal *x, doublereal *y) +{ + /* System generated locals */ + doublereal ret_val; + + +/* Auxiliary routine for test program for Level 3 Blas. */ + +/* -- Written on 8-February-1989. */ +/* Jack Dongarra, Argonne National Laboratory. */ +/* Iain Duff, AERE Harwell. */ +/* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ +/* Sven Hammarling, Numerical Algorithms Group Ltd. */ + + ret_val = *x - *y; + return ret_val; + +/* End of DDIFF. */ + +} /* ddiff_ */ + +/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; } From 175e357f5d576e493cba9f180ad78800462f0c3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 14:19:50 +0100 Subject: [PATCH 704/718] run apt-get update before fetching Ubuntu packages --- .github/workflows/dynamic_arch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 49721958a..669aa8116 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -42,6 +42,7 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update sudo apt-get install -y gfortran cmake ccache libtinfo5 elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. From a1ec94c258ac4151eb69012310e0f694f7067ea2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 17:46:07 +0100 Subject: [PATCH 705/718] Readd proper f2c'd sources for the GEMM3M tests --- ctest/c_cblat3c_3m.c | 1490 +++++-------------------- ctest/c_zblat3c_3m.c | 2454 +++++++++++++----------------------------- 2 files changed, 1043 insertions(+), 2901 deletions(-) diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c index 9cfa26a41..b5d6bf9cb 100644 --- a/ctest/c_cblat3c_3m.c +++ b/ctest/c_cblat3c_3m.c @@ -10,25 +10,7 @@ #undef I #endif -#if defined(_WIN64) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifdef LAPACK_ILP64 -typedef BLASLONG blasint; -#if defined(_WIN64) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif +#include "common.h" typedef blasint integer; @@ -247,7 +229,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,259 +242,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i= 0; if (trace) { - o__1.oerr = 0; +/* o__1.oerr = 0; o__1.ounit = ntra; o__1.ofnmlen = 32; o__1.ofnm = snaps; @@ -774,147 +411,122 @@ static logical c_false = FALSE_; o__1.oacc = 0; o__1.ofm = 0; o__1.oblnk = 0; - f_open(&o__1); + f_open(&o__1);*/ } /* Read the flag that directs rewinding of the snapshot file. */ - s_rsle(&io___7); - do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); - e_rsle(); - rewi = rewi && trace; + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; /* Read the flag that directs stopping on any failure. */ - s_rsle(&io___9); - do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); - e_rsle(); -/* Read the flag that indicates whether error exits are to be tested. */ - s_rsle(&io___11); - do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); - e_rsle(); -/* Read the flag that indicates whether row-major data layout to be tested. */ - s_rsle(&io___13); - do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); - e_rsle(); -/* Read the threshold value of the test ratio */ - s_rsle(&io___15); - do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; + fgets(line,80,stdin); + sscanf(line,"%d",&layout); + fgets(line,80,stdin); + sscanf(line,"%f",&thresh); + /* Read and check the parameter values for the tests. */ /* Values of N */ - s_rsle(&io___17); - do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif + if (nidim < 1 || nidim > 9) { - s_wsfe(&io___19); - do_fio(&c__1, "N", (ftnlen)1); - do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); goto L220; } - s_rsle(&io___20); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif i__1 = nidim; for (i__ = 1; i__ <= i__1; ++i__) { if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { - s_wsfe(&io___23); - do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); goto L220; } /* L10: */ } /* Values of ALPHA */ - s_rsle(&io___24); - do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif if (nalf < 1 || nalf > 7) { - s_wsfe(&io___26); - do_fio(&c__1, "ALPHA", (ftnlen)5); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); goto L220; } - s_rsle(&io___27); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + +// i__1 = nalf; +// for (i__ = 1; i__ <= i__1; ++i__) { +// do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex)); +// } /* Values of BETA */ - s_rsle(&io___29); - do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); - e_rsle(); - if (nbet < 1 || nbet > 7) { - s_wsfe(&io___31); - do_fio(&c__1, "BETA", (ftnlen)4); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); goto L220; } - s_rsle(&io___32); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex)); - } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); + /* Report values of parameters. */ - s_wsfe(&io___34); - e_wsfe(); - s_wsfe(&io___35); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_wsfe(); - s_wsfe(&io___36); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real)); - } - e_wsfe(); - s_wsfe(&io___37); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real)); - } - e_wsfe(); + printf("TESTS OF THE COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + if (! tsterr) { - s_wsle(&io___38); - e_wsle(); - s_wsfe(&io___39); - e_wsfe(); + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); } - s_wsle(&io___40); - e_wsle(); - s_wsfe(&io___41); - do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real)); - e_wsfe(); - s_wsle(&io___42); - e_wsle(); + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh); rorder = FALSE_; corder = FALSE_; if (layout == 2) { rorder = TRUE_; corder = TRUE_; - s_wsfe(&io___45); - e_wsfe(); + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); } else if (layout == 1) { rorder = TRUE_; - s_wsfe(&io___46); - e_wsfe(); + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); } else if (layout == 0) { corder = TRUE_; - s_wsfe(&io___47); - e_wsfe(); + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); } - s_wsle(&io___48); - e_wsle(); /* Read names of subroutines and flags which indicate */ /* whether they are to be tested. */ @@ -924,42 +536,33 @@ static logical c_false = FALSE_; /* L20: */ } L30: - i__1 = s_rsfe(&io___50); - if (i__1 != 0) { + if (! fgets(line,80,stdin)) { goto L60; } - i__1 = do_fio(&c__1, snamet, (ftnlen)13); - if (i__1 != 0) { - goto L60; - } - i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); - if (i__1 != 0) { - goto L60; - } - i__1 = e_rsfe(); - if (i__1 != 0) { + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 0) { goto L50; } /* L40: */ } - s_wsfe(&io___53); - do_fio(&c__1, snamet, (ftnlen)13); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); L50: ltest[i__ - 1] = ltestt; goto L30; L60: - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 5; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ /* Compute EPS (the machine precision). */ @@ -973,9 +576,7 @@ L70: goto L70; L80: eps += eps; - s_wsfe(&io___55); - do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real)); - e_wsfe(); + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); /* Check the reliability of CMMCH using exact data. */ @@ -1015,13 +616,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___68); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & @@ -1029,13 +629,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___69); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } i__1 = n; for (j = 1; j <= i__1; ++j) { @@ -1061,13 +660,12 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___70); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & @@ -1075,33 +673,26 @@ L80: &c__6, &c_true); same = lce_(cc, ct, &n); if (! same || err != 0.f) { - s_wsfe(&io___71); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } /* Test each subroutine in turn. */ for (isnum = 1; isnum <= 9; ++isnum) { - s_wsle(&io___73); - e_wsle(); if (! ltest[isnum - 1]) { /* Subprogram is not to be tested. */ - s_wsfe(&io___74); - do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); - e_wsfe(); + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); } else { - s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( - ftnlen)13); + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); /* Test error exits. */ if (tsterr) { - cc3chke_(snames + (isnum - 1) * 13); - s_wsle(&io___75); - e_wsle(); + cc3chke_(snames[isnum - 1]); } /* Test computations. */ infoc_1.infot = 0; @@ -1121,13 +712,13 @@ L80: /* Test CGEMM, 01. */ L140: if (corder) { - cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1136,13 +727,13 @@ L140: /* Test CHEMM, 02, CSYMM, 03. */ L150: if (corder) { - cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1151,13 +742,13 @@ L150: /* Test CTRMM, 04, CTRSM, 05. */ L160: if (corder) { - cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & c__0); } if (rorder) { - cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & c__1); @@ -1166,13 +757,13 @@ L160: /* Test CHERK, 06, CSYRK, 07. */ L170: if (corder) { - cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__0); } if (rorder) { - cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, cc, cs, ct, g, &c__1); @@ -1181,13 +772,13 @@ L170: /* Test CHER2K, 08, CSYR2K, 09. */ L180: if (corder) { - cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, &c__0); } if (rorder) { - cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, ct, g, w, &c__1); @@ -1201,32 +792,29 @@ L190: } /* L200: */ } - s_wsfe(&io___82); - e_wsfe(); + printf("\nEND OF TESTS\n"); goto L230; L210: - s_wsfe(&io___83); - e_wsfe(); + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); goto L230; L220: - s_wsfe(&io___84); - e_wsfe(); - + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); L230: if (trace) { - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = ntra; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ } - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 6; cl__1.csta = 0; f_clos(&cl__1); - s_stop("", (ftnlen)0); - + s_stop("", (ftnlen)0);*/ + exit(0); /* End of CBLAT3. */ @@ -1244,30 +832,9 @@ L230: static char ich[3] = "NTC"; - /* Format strings */ - static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T" - "AKEN ON VALID\002,\002 CALL ******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - alist al__1; /* Local variables */ complex beta; @@ -1288,7 +855,11 @@ L230: extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer *, char *, char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *); - integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + integer ia, ib, ma, mb, na, nb, nc, ik, im, in; + extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, + integer *, integer *, complex *, complex *, integer *, complex *, + integer *, complex *, complex *, integer *); + integer ks, ms, ns; extern logical lceres_(char *, char *, integer *, integer *, complex *, complex *, integer *); char tranas[1], tranbs[1], transa[1], transb[1]; @@ -1297,20 +868,6 @@ L230: extern logical lce_(complex *, complex *, integer *); complex als, bls; real err; - extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer * - , integer *, integer *, complex *, complex *, integer *, complex * - , integer *, complex *, complex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; - - /* Tests CGEMM. */ @@ -1497,20 +1054,21 @@ L230: &ldb, &beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1); */ } - ccgemm3m_(iorder, transa, transb, &m, &n, &k, - &alpha, &aa[1], &lda, &bb[1], &ldb, & + ccgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & beta, &cc[1], &ldc); /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___128.ciunit = *nout; - s_wsfe(&io___128); - e_wsfe(); +// io___128.ciunit = *nout; +// s_wsfe(&io___128); +// e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -1548,11 +1106,7 @@ L230: for (i__ = 1; i__ <= i__6; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___131.ciunit = *nout; - s_wsfe(&io___131); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);; } /* L40: */ } @@ -1606,51 +1160,34 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___133.ciunit = *nout; - s_wsfe(&io___133); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___134.ciunit = *nout; - s_wsfe(&io___134); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___135.ciunit = *nout; - s_wsfe(&io___135); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___136.ciunit = *nout; - s_wsfe(&io___136); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L120: - io___137.ciunit = *nout; - s_wsfe(&io___137); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & lda, &ldb, &beta, &ldc); L130: return 0; -/* L9995: */ +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ /* End of CCHK1. */ @@ -1662,21 +1199,9 @@ L130: k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" - ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; - /* Local variables */ char crc[14], cta[14], ctb[14]; - /* Fortran I/O blocks */ - static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)transa == 'N') { s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); } else if (*(unsigned char *)transa == 'T') { @@ -1696,25 +1221,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___141.ciunit = *nout; - s_wsfe(&io___141); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cta, (ftnlen)14); - do_fio(&c__1, ctb, (ftnlen)14); - e_wsfe(); - io___142.ciunit = *nout; - s_wsfe(&io___142); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn1_ */ @@ -1731,30 +1239,9 @@ L130: static char ichs[2] = "LR"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - alist al__1; /* Local variables */ complex beta; @@ -1798,17 +1285,6 @@ L130: integer icu; real err; - /* Fortran I/O blocks */ - static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHEMM and CSYMM. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -1974,9 +1450,9 @@ L130: ; } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } if (conj) { cchemm_(iorder, side, uplo, &m, &n, &alpha, & @@ -1991,9 +1467,7 @@ L130: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___181.ciunit = *nout; - s_wsfe(&io___181); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L110; } @@ -2028,11 +1502,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___184.ciunit = *nout; - s_wsfe(&io___184); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -2090,51 +1560,34 @@ L90: if (errmax < *thresh) { if (*iorder == 0) { - io___186.ciunit = *nout; - s_wsfe(&io___186); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___187.ciunit = *nout; - s_wsfe(&io___187); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___188.ciunit = *nout; - s_wsfe(&io___188); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___189.ciunit = *nout; - s_wsfe(&io___189); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L120; L110: - io___190.ciunit = *nout; - s_wsfe(&io___190); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, &beta, &ldc); L120: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of CCHK2. */ @@ -2145,21 +1598,9 @@ L120: *iorder, char *side, char *uplo, integer *m, integer *n, complex * alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," - "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; - /* Local variables */ char cs[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); } else { @@ -2175,24 +1616,8 @@ L120: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___194.ciunit = *nout; - s_wsfe(&io___194); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___195.ciunit = *nout; - s_wsfe(&io___195); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn2_ */ @@ -2210,31 +1635,10 @@ L120: static char ichd[2] = "UN"; static char ichs[2] = "LR"; - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; - alist al__1; /* Local variables */ char diag[1]; @@ -2279,17 +1683,6 @@ L120: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CTRMM and CTRSM. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -2444,14 +1837,14 @@ L120: if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, - &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) - 1, (ftnlen)1, (ftnlen)1); + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cctrmm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ @@ -2461,14 +1854,14 @@ L120: if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, - &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) - 1, (ftnlen)1, (ftnlen)1); + &n, &alpha, &lda, &ldb/*, ( + ftnlen)12, (ftnlen)1, (ftnlen) + 1, (ftnlen)1, (ftnlen)1*/); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cctrsm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ @@ -2478,9 +1871,7 @@ L120: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___236.ciunit = *nout; - s_wsfe(&io___236); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -2517,11 +1908,7 @@ L120: for (i__ = 1; i__ <= i__4; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___239.ciunit = *nout; - s_wsfe(&io___239); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L50: */ } @@ -2543,8 +1930,8 @@ L120: c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ 1], &ldb, eps, &err, - fatal, nout, &c_true, ( - ftnlen)1, (ftnlen)1); + fatal, nout, &c_true/*, ( + ftnlen)1, (ftnlen)1*/); } else { cmmch_("N", transa, &m, &n, &n, & alpha, &b[b_offset], nmax, @@ -2631,44 +2018,25 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___241.ciunit = *nout; - s_wsfe(&io___241); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___242.ciunit = *nout; - s_wsfe(&io___242); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___243.ciunit = *nout; - s_wsfe(&io___243); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___244.ciunit = *nout; - s_wsfe(&io___244); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L150: - io___245.ciunit = *nout; - s_wsfe(&io___245); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (*trace) { cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & alpha, &lda, &ldb); @@ -2677,7 +2045,9 @@ L150: L160: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ /* End of CCHK3. */ @@ -2688,21 +2058,9 @@ L160: *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, integer *n, complex *alpha, integer *lda, integer *ldb) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 " - "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." - "\002)"; - /* Local variables */ char ca[14], cd[14], cs[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); } else { @@ -2730,24 +2088,9 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___251.ciunit = *nout; - s_wsfe(&io___251); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___252.ciunit = *nout; - s_wsfe(&io___252); - do_fio(&c__1, ca, (ftnlen)14); - do_fio(&c__1, cd, (ftnlen)14); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + return 0; } /* cprcn3_ */ @@ -2764,33 +2107,10 @@ L160: static char icht[2] = "NC"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; - /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; complex q__1; - alist al__1; /* Local variables */ complex beta; @@ -2841,18 +2161,6 @@ L160: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHERK and CSYRK. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -2892,6 +2200,8 @@ L160: nc = 0; reset = TRUE_; errmax = 0.f; + rals = 1.f; + rbets = 1.f; i__1 = *nidim; for (in = 1; in <= i__1; ++in) { @@ -2965,8 +2275,8 @@ L160: } null = n <= 0; if (conj) { - null = null || (k <= 0 || ralpha == 0.f) && - rbeta == 1.f; + null = null || ((k <= 0 || ralpha == 0.f) && + rbeta == 1.f); } /* Generate the matrix C. */ @@ -3022,9 +2332,9 @@ L160: rbeta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccherk_(iorder, uplo, trans, &n, &k, &ralpha, &aa[1], &lda, &rbeta, &cc[1], &ldc); @@ -3035,9 +2345,9 @@ L160: beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, & aa[1], &lda, &beta, &cc[1], &ldc); @@ -3046,9 +2356,7 @@ L160: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___294.ciunit = *nout; - s_wsfe(&io___294); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -3091,11 +2399,7 @@ L160: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___297.ciunit = *nout; - s_wsfe(&io___297); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L30: */ } @@ -3179,52 +2483,30 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___304.ciunit = *nout; - s_wsfe(&io___304); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___305.ciunit = *nout; - s_wsfe(&io___305); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___306.ciunit = *nout; - s_wsfe(&io___306); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___307.ciunit = *nout; - s_wsfe(&io___307); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L110: if (n > 1) { - io___308.ciunit = *nout; - s_wsfe(&io___308); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L120: - io___309.ciunit = *nout; - s_wsfe(&io___309); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (conj) { cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, &rbeta, &ldc); @@ -3236,8 +2518,12 @@ L120: L130: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ /* End of CCHK4. */ @@ -3248,21 +2534,9 @@ L130: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" - ",\002,i3,\002).\002)"; - /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3280,23 +2554,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___313.ciunit = *nout; - s_wsfe(&io___313); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___314.ciunit = *nout; - s_wsfe(&io___314); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); return 0; } /* cprcn4_ */ @@ -3306,20 +2565,9 @@ L130: *iorder, char *uplo, char *transa, integer *n, integer *k, real * alpha, integer *lda, real *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" - ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; - /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3337,23 +2585,8 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___318.ciunit = *nout; - s_wsfe(&io___318); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___319.ciunit = *nout; - s_wsfe(&io___319); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); return 0; } /* cprcn6_ */ @@ -3370,32 +2603,10 @@ L130: static char icht[2] = "NC"; static char ichu[2] = "UL"; - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; complex q__1, q__2; - alist al__1; /* Local variables */ integer jjab; @@ -3444,18 +2655,6 @@ L130: integer ict, icu; real err; - /* Fortran I/O blocks */ - static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; - - - /* Tests CHER2K and CSYR2K. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -3578,8 +2777,8 @@ L130: } null = n <= 0; if (conj) { - null = null || (k <= 0 || alpha.r == 0.f && - alpha.i == 0.f) && rbeta == 1.f; + null = null || ((k <= 0 || (alpha.r == 0.f && + alpha.i == 0.f)) && rbeta == 1.f); } /* Generate the matrix C. */ @@ -3640,9 +2839,9 @@ L130: &rbeta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccher2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &rbeta, & @@ -3654,9 +2853,9 @@ L130: &beta, &ldc); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &beta, & @@ -3666,9 +2865,7 @@ L130: /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___362.ciunit = *nout; - s_wsfe(&io___362); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -3708,11 +2905,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___365.ciunit = *nout; - s_wsfe(&io___365); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -3745,7 +2938,7 @@ L130: i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = ((j - 1) << 1) * *nmax + k + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8].i, @@ -3757,14 +2950,14 @@ L130: if (conj) { i__7 = k + i__; r_cnjg(&q__2, &alpha); - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[ i__8].r; w[i__7].r = q__1.r, w[i__7].i = q__1.i; } else { i__7 = k + i__; - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] .i, q__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; @@ -3865,52 +3058,30 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___373.ciunit = *nout; - s_wsfe(&io___373); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___374.ciunit = *nout; - s_wsfe(&io___374); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___375.ciunit = *nout; - s_wsfe(&io___375); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___376.ciunit = *nout; - s_wsfe(&io___376); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L140: if (n > 1) { - io___377.ciunit = *nout; - s_wsfe(&io___377); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L150: - io___378.ciunit = *nout; - s_wsfe(&io___378); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (conj) { cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & ldb, &rbeta, &ldc); @@ -3922,8 +3093,12 @@ L150: L160: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of CCHK5. */ @@ -3934,21 +3109,10 @@ L160: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, integer *ldb, complex *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" - ",f4.1,\002), C,\002,i3,\002).\002)"; /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -3966,24 +3130,8 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___382.ciunit = *nout; - s_wsfe(&io___382); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___383.ciunit = *nout; - s_wsfe(&io___383); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); return 0; } /* cprcn5_ */ @@ -3993,21 +3141,10 @@ L160: *iorder, char *uplo, char *transa, integer *n, integer *k, complex * alpha, integer *lda, integer *ldb, real *beta, integer *ldc) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," - "\002,i3,\002).\002)"; /* Local variables */ char ca[14], cu[14], crc[14]; - /* Fortran I/O blocks */ - static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; - - if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); } else { @@ -4025,24 +3162,8 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___387.ciunit = *nout; - s_wsfe(&io___387); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___388.ciunit = *nout; - s_wsfe(&io___388); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); return 0; } /* cprcn7_ */ @@ -4101,7 +3222,7 @@ L160: for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { - if (gen || upper && i__ <= j || lower && i__ >= j) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { i__3 = i__ + j * a_dim1; cbeg_(&q__2, reset); q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i; @@ -4230,15 +3351,6 @@ L160: real *g, complex *cc, integer *ldcc, real *eps, real *err, logical * fatal, integer *nout, logical *mv) { - /* Format strings */ - static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" - " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " - " EXPECTED RE\002,\002SULT COMPUTED R" - "ESULT\002)"; - static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," - "\002)\002))"; - static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -4251,14 +3363,6 @@ L160: integer i__, j, k; logical trana, tranb, ctrana, ctranb; - /* Fortran I/O blocks */ - static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; - static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; - - - /* Checks the results of the computational tests. */ /* Auxiliary routine for test program for Level 3 Blas. */ @@ -4595,35 +3699,19 @@ L160: L230: *fatal = TRUE_; - io___409.ciunit = *nout; - s_wsfe(&io___409); - e_wsfe(); + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); i__1 = *m; for (i__ = 1; i__ <= i__1; ++i__) { if (*mv) { - io___410.ciunit = *nout; - s_wsfe(&io___410); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) - ); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); } else { - io___411.ciunit = *nout; - s_wsfe(&io___411); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real) - ); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real)); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); } /* L240: */ } if (*n > 1) { - io___412.ciunit = *nout; - s_wsfe(&io___412); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); } L250: @@ -4760,7 +3848,7 @@ logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa, } } -/* L60: */ +/* 60 CONTINUE */ ret_val = TRUE_; goto L80; L70: @@ -4851,4 +3939,4 @@ real sdiff_(real *x, real *y) } /* sdiff_ */ -/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; } +/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/ diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c index 059daccb5..0c76f11e7 100644 --- a/ctest/c_zblat3c_3m.c +++ b/ctest/c_zblat3c_3m.c @@ -10,25 +10,7 @@ #undef I #endif -#if defined(_WIN64) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifdef LAPACK_ILP64 -typedef BLASLONG blasint; -#if defined(_WIN64) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif +#include "common.h" typedef blasint integer; @@ -40,14 +22,11 @@ typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; #ifdef _MSC_VER -static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} -static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} #else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} -static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} #endif #define pCf(z) (*_pCf(z)) @@ -247,7 +226,6 @@ typedef struct Namelist Namelist; #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } #define sig_die(s, kill) { exit(1); } #define s_stop(s, n) {exit(0);} -static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_abs(z) (cabs(Cd(z))) #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} @@ -261,259 +239,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 -#ifdef __cplusplus -typedef logical (*L_fp)(...); -#else -typedef logical (*L_fp)(); -#endif - -static float spow_ui(float x, integer n) { - float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static double dpow_ui(double x, integer n) { - double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#ifdef _MSC_VER -static _Fcomplex cpow_ui(complex x, integer n) { - complex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; - for(u = n; ; ) { - if(u & 01) pow.r *= x.r, pow.i *= x.i; - if(u >>= 1) x.r *= x.r, x.i *= x.i; - else break; - } - } - _Fcomplex p={pow.r, pow.i}; - return p; -} -#else -static _Complex float cpow_ui(_Complex float x, integer n) { - _Complex float pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -#ifdef _MSC_VER -static _Dcomplex zpow_ui(_Dcomplex x, integer n) { - _Dcomplex pow={1.0,0.0}; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; - for(u = n; ; ) { - if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; - if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; - else break; - } - } - _Dcomplex p = {pow._Val[0], pow._Val[1]}; - return p; -} -#else -static _Complex double zpow_ui(_Complex double x, integer n) { - _Complex double pow=1.0; unsigned long int u; - if(n != 0) { - if(n < 0) n = -n, x = 1/x; - for(u = n; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -#endif -static integer pow_ii(integer x, integer n) { - integer pow; unsigned long int u; - if (n <= 0) { - if (n == 0 || x == 1) pow = 1; - else if (x != -1) pow = x == 0 ? 1/x : 0; - else n = -n; - } - if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { - u = n; - for(pow = 1; ; ) { - if(u & 01) pow *= x; - if(u >>= 1) x *= x; - else break; - } - } - return pow; -} -static integer dmaxloc_(double *w, integer s, integer e, integer *n) -{ - double m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static integer smaxloc_(float *w, integer s, integer e, integer *n) -{ - float m; integer i, mi; - for(m=w[s-1], mi=s, i=s+1; i<=e; i++) - if (w[i-1]>m) mi=i ,m=w[i-1]; - return mi-s+1; -} -static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { - integer n = *n_, incx = *incx_, incy = *incy_, i; -#ifdef _MSC_VER - _Fcomplex zdotc = {0.0, 0.0}; - if (incx == 1 && incy == 1) { - for (i=0;i= 0; if (trace) { - o__1.oerr = 0; +/* o__1.oerr = 0; o__1.ounit = ntra; o__1.ofnmlen = 32; o__1.ofnm = snaps; @@ -783,149 +401,119 @@ static logical c_false = FALSE_; o__1.oacc = 0; o__1.ofm = 0; o__1.oblnk = 0; - f_open(&o__1); + f_open(&o__1);*/ } /* Read the flag that directs rewinding of the snapshot file. */ - s_rsle(&io___7); - do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical)); - e_rsle(); - rewi = rewi && trace; + fgets(line,80,stdin); + sscanf(line,"%d",&rewi); + rewi = rewi && trace; /* Read the flag that directs stopping on any failure. */ - s_rsle(&io___9); - do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + sfatal=FALSE_; + if (tmpchar=='T')sfatal=TRUE_; /* Read the flag that indicates whether error exits are to be tested. */ - s_rsle(&io___11); - do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%c",&tmpchar); + tsterr=FALSE_; + if (tmpchar=='T')tsterr=TRUE_; /* Read the flag that indicates whether row-major data layout to be tested. */ - s_rsle(&io___13); - do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%d",&layout); /* Read the threshold value of the test ratio */ - s_rsle(&io___15); - do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"%lf",&thresh); /* Read and check the parameter values for the tests. */ /* Values of N */ - s_rsle(&io___17); - do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%d",&nidim); +#else + sscanf(line,"%d",&nidim); +#endif if (nidim < 1 || nidim > 9) { - s_wsfe(&io___19); - do_fio(&c__1, "N", (ftnlen)1); - do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; + fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9"); + goto L220; } - s_rsle(&io___20); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#else + sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2], + &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]); +#endif i__1 = nidim; for (i__ = 1; i__ <= i__1; ++i__) { - if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { - s_wsfe(&io___23); - do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; - } + if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) { + fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n"); + goto L220; + } /* L10: */ } /* Values of ALPHA */ - s_rsle(&io___24); - do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer)); - e_rsle(); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nalf); +#else + sscanf(line,"%d",&nalf); +#endif if (nalf < 1 || nalf > 7) { - s_wsfe(&io___26); - do_fio(&c__1, "ALPHA", (ftnlen)5); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; - } - s_rsle(&io___27); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof( - doublecomplex)); + fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i, + &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i); + /* Values of BETA */ - s_rsle(&io___29); - do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer)); - e_rsle(); - if (nbet < 1 || nbet > 7) { - s_wsfe(&io___31); - do_fio(&c__1, "BETA", (ftnlen)4); - do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer)); - e_wsfe(); - goto L220; - } - s_rsle(&io___32); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof( - doublecomplex)); + fgets(line,80,stdin); +#ifdef USE64BITINT + sscanf(line,"%ld",&nbet); +#else + sscanf(line,"%d",&nbet); +#endif + if (nalf < 1 || nbet > 7) { + fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n"); + goto L220; } - e_rsle(); + fgets(line,80,stdin); + sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i, + &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i); /* Report values of parameters. */ - s_wsfe(&io___34); - e_wsfe(); - s_wsfe(&io___35); - i__1 = nidim; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer)); - } - e_wsfe(); - s_wsfe(&io___36); - i__1 = nalf; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal)); - } - e_wsfe(); - s_wsfe(&io___37); - i__1 = nbet; - for (i__ = 1; i__ <= i__1; ++i__) { - do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal)); - } - e_wsfe(); + printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n"); + printf(" FOR N"); + for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]); + printf("\n"); + printf(" FOR ALPHA"); + for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i); + printf("\n"); + printf(" FOR BETA"); + for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i); + printf("\n"); + if (! tsterr) { - s_wsle(&io___38); - e_wsle(); - s_wsfe(&io___39); - e_wsfe(); + printf(" ERROR-EXITS WILL NOT BE TESTED\n"); } - s_wsle(&io___40); - e_wsle(); - s_wsfe(&io___41); - do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_wsle(&io___42); - e_wsle(); + + printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh); rorder = FALSE_; corder = FALSE_; if (layout == 2) { rorder = TRUE_; corder = TRUE_; - s_wsfe(&io___45); - e_wsfe(); + printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n"); } else if (layout == 1) { rorder = TRUE_; - s_wsfe(&io___46); - e_wsfe(); + printf("ROW-MAJOR DATA LAYOUT IS TESTED\n"); } else if (layout == 0) { corder = TRUE_; - s_wsfe(&io___47); - e_wsfe(); + printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n"); } - s_wsle(&io___48); - e_wsle(); /* Read names of subroutines and flags which indicate */ /* whether they are to be tested. */ @@ -935,42 +523,33 @@ static logical c_false = FALSE_; /* L20: */ } L30: - i__1 = s_rsfe(&io___50); - if (i__1 != 0) { - goto L60; - } - i__1 = do_fio(&c__1, snamet, (ftnlen)13); - if (i__1 != 0) { - goto L60; - } - i__1 = do_fio(&c__1, (char *)<estt, (ftnlen)sizeof(logical)); - if (i__1 != 0) { - goto L60; + if (! fgets(line,80,stdin)) { + goto L60; } - i__1 = e_rsfe(); - if (i__1 != 0) { - goto L60; + i__1 = sscanf(line,"%12c %c",snamet,&tmpchar); + ltestt=FALSE_; + if (tmpchar=='T')ltestt=TRUE_; + if (i__1 < 2) { + goto L60; } for (i__ = 1; i__ <= 9; ++i__) { - if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == - 0) { - goto L50; - } + if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == + 0) { + goto L50; + } /* L40: */ } - s_wsfe(&io___53); - do_fio(&c__1, snamet, (ftnlen)13); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); + exit(1); L50: ltest[i__ - 1] = ltestt; goto L30; L60: - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 5; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ /* Compute EPS (the machine precision). */ @@ -984,9 +563,7 @@ L70: goto L70; L80: eps += eps; - s_wsfe(&io___55); - do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps); /* Check the reliability of ZMMCH using exact data. */ @@ -1023,30 +600,28 @@ L80: *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___68); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___69); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } i__1 = n; for (j = 1; j <= i__1; ++j) { @@ -1069,56 +644,48 @@ L80: *(unsigned char *)transb = 'N'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___70); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } *(unsigned char *)transb = 'C'; zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], & c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, - &c__6, &c_true); + &c__6, &c_true, (ftnlen)1, (ftnlen)1); same = lze_(cc, ct, &n); if (! same || err != 0.) { - s_wsfe(&io___71); - do_fio(&c__1, transa, (ftnlen)1); - do_fio(&c__1, transb, (ftnlen)1); - do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical)); - do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal)); - e_wsfe(); - s_stop("", (ftnlen)0); + printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n"); + printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb); + printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err); + printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n"); + printf("****** TESTS ABANDONED ******\n"); + exit(1); } /* Test each subroutine in turn. */ for (isnum = 1; isnum <= 9; ++isnum) { - s_wsle(&io___73); - e_wsle(); if (! ltest[isnum - 1]) { /* Subprogram is not to be tested. */ - s_wsfe(&io___74); - do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13); - e_wsfe(); + printf("%12s WAS NOT TESTED\n",snames[isnum-1]); } else { - s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, ( - ftnlen)13); + s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, ( + ftnlen)12); /* Test error exits. */ if (tsterr) { - cz3chke_(snames + (isnum - 1) * 13); - s_wsle(&io___75); - e_wsle(); + cz3chke_(snames[isnum - 1], (ftnlen)12); } /* Test computations. */ infoc_1.infot = 0; infoc_1.ok = TRUE_; fatal = FALSE_; - switch (isnum) { + switch ((int)isnum) { case 1: goto L140; case 2: goto L150; case 3: goto L150; @@ -1132,76 +699,76 @@ L80: /* Test ZGEMM, 01. */ L140: if (corder) { - zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZHEMM, 02, ZSYMM, 03. */ L150: if (corder) { - zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZTRMM, 04, ZTRSM, 05. */ L160: if (corder) { - zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & - c__0); + c__0, (ftnlen)12); } if (rorder) { - zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, & - c__1); + c__1, (ftnlen)12); } goto L190; /* Test ZHERK, 06, ZSYRK, 07. */ L170: if (corder) { - zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__0); + cc, cs, ct, g, &c__0, (ftnlen)12); } if (rorder) { - zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__, - cc, cs, ct, g, &c__1); + cc, cs, ct, g, &c__1, (ftnlen)12); } goto L190; /* Test ZHER2K, 08, ZSYR2K, 09. */ L180: if (corder) { - zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, - ct, g, w, &c__0); + ct, g, w, &c__0, (ftnlen)12); } if (rorder) { - zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra, + zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra, &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, & nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, - ct, g, w, &c__1); + ct, g, w, &c__1, (ftnlen)12); } goto L190; @@ -1212,122 +779,66 @@ L190: } /* L200: */ } - s_wsfe(&io___82); - e_wsfe(); + printf("\nEND OF TESTS\n"); goto L230; L210: - s_wsfe(&io___83); - e_wsfe(); + printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n"); goto L230; L220: - s_wsfe(&io___84); - e_wsfe(); + printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n"); + printf("****** TESTS ABANDONED ******\n"); L230: if (trace) { - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = ntra; cl__1.csta = 0; - f_clos(&cl__1); + f_clos(&cl__1);*/ } - cl__1.cerr = 0; +/* cl__1.cerr = 0; cl__1.cunit = 6; cl__1.csta = 0; - f_clos(&cl__1); - s_stop("", (ftnlen)0); - + f_clos(&cl__1);*/ + exit(0); /* End of ZBLAT3. */ - return 0; } /* MAIN__ */ -/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ich[3] = "NTC"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ich[3+1] = "NTC"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; - alist al__1; /* Local variables */ - extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer * - , integer *, integer *, doublecomplex *, doublecomplex *, integer - *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same, null; - integer i__, k, m, n; - doublecomplex alpha; - logical isame[13], trana, tranb; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - integer ia, ib; - extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, integer *, doublecomplex - *, integer *, integer *, doublecomplex *, integer *); - integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; - char tranas[1], tranbs[1], transa[1], transb[1]; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; - doublecomplex als, bls; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___128 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___131 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___133 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___134 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___135 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___136 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___137 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, null; + static integer i__, k, m, n; + static doublecomplex alpha; + static logical isame[13], trana, tranb; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static integer ia, ib; + extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns; + extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char tranas[1], tranbs[1], transa[1], transb[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als, bls; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZGEMM. */ @@ -1339,6 +850,17 @@ L230: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -1362,6 +884,7 @@ L230: a -= a_offset; /* Function Body */ +/* .. Executable Statements .. */ nargs = 13; nc = 0; @@ -1418,7 +941,8 @@ L230: /* Generate the matrix A. */ zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[ - 1], &lda, &reset, &c_b1); + 1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); for (icb = 1; icb <= 3; ++icb) { *(unsigned char *)transb = *(unsigned char *)&ich[icb @@ -1447,7 +971,8 @@ L230: /* Generate the matrix B. */ zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, & - bb[1], &ldb, &reset, &c_b1); + bb[1], &ldb, &reset, &c_b1, (ftnlen)2, ( + ftnlen)1, (ftnlen)1); i__4 = *nalf; for (ia = 1; ia <= i__4; ++ia) { @@ -1462,7 +987,8 @@ L230: /* Generate the matrix C. */ zmake_("ge", " ", " ", &m, &n, &c__[c_offset], - nmax, &cc[1], &ldc, &reset, &c_b1); + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1511,23 +1037,23 @@ L230: if (*trace) { zprcn1_(ntra, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &lda, - &ldb, &beta, &ldc); + &ldb, &beta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } - czgemm3m_(iorder, transa, transb, &m, &n, &k, - &alpha, &aa[1], &lda, &bb[1], &ldb, & - beta, &cc[1], &ldc); + czgemm3m_(iorder, transa, transb, &m, &n, &k, & + alpha, &aa[1], &lda, &bb[1], &ldb, & + beta, &cc[1], &ldc, (ftnlen)1, ( + ftnlen)1); /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___128.ciunit = *nout; - s_wsfe(&io___128); - e_wsfe(); + printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -1553,7 +1079,8 @@ L230: isame[11] = lze_(&cs[1], &cc[1], &lcc); } else { isame[11] = lzeres_("ge", " ", &m, &n, & - cs[1], &cc[1], &ldc); + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); } isame[12] = ldcs == ldc; @@ -1565,11 +1092,7 @@ L230: for (i__ = 1; i__ <= i__6; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___131.ciunit = *nout; - s_wsfe(&io___131); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -1586,7 +1109,8 @@ L230: &a[a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, - eps, &err, fatal, nout, &c_true); + eps, &err, fatal, nout, &c_true, + (ftnlen)1, (ftnlen)1); errmax = f2cmax(errmax,err); /* If got really bad answer, report and */ /* return. */ @@ -1623,76 +1147,44 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___133.ciunit = *nout; - s_wsfe(&io___133); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___134.ciunit = *nout; - s_wsfe(&io___134); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___135.ciunit = *nout; - s_wsfe(&io___135); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___136.ciunit = *nout; - s_wsfe(&io___136); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L120: - io___137.ciunit = *nout; - s_wsfe(&io___137); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, & - lda, &ldb, &beta, &ldc); + lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); L130: return 0; -/* L9995: */ +/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */ +/* $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */ +/* $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */ /* End of ZCHK1. */ } /* zchk1_ */ -/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer - *iorder, char *transa, char *transb, integer *m, integer *n, integer * - k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex * - beta, integer *ldc) +/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002" - ",\002,f4.1,\002) , C,\002,i3,\002).\002)"; - /* Local variables */ - char crc[14], cta[14], ctb[14]; - - /* Fortran I/O blocks */ - static cilist io___141 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___142 = { 0, 0, 0, fmt_9994, 0 }; - + static char crc[14], cta[14], ctb[14]; if (*(unsigned char *)transa == 'N') { s_copy(cta, " CblasNoTrans", (ftnlen)14, (ftnlen)14); @@ -1713,123 +1205,52 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___141.ciunit = *nout; - s_wsfe(&io___141); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cta, (ftnlen)14); - do_fio(&c__1, ctb, (ftnlen)14); - e_wsfe(); - io___142.ciunit = *nout; - s_wsfe(&io___142); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb); + printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn1_ */ -/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ichs[2] = "LR"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ichs[2+1] = "LR"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; - alist al__1; /* Local variables */ - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same; - char side[1]; - logical conj, left, null; - char uplo[1]; - integer i__, m, n; - doublecomplex alpha; - logical isame[13]; - char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - char uplos[1]; - integer ia, ib; - extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, integer *, doublecomplex *, integer *); - integer na, nc, im, in, ms, ns; - extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - integer laa, lbb, lda, lcc, ldb, ldc, ics; - doublecomplex als, bls; - integer icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___181 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___184 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___186 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___187 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___188 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___189 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___190 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same; + static char side[1]; + static logical isconj, left, null; + static char uplo[1]; + static integer i__, m, n; + static doublecomplex alpha; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, ib; + extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer na, nc, im, in, ms, ns; + extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc, ics; + static doublecomplex als, bls; + static integer icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHEMM and ZSYMM. */ @@ -1841,6 +1262,17 @@ L130: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -1864,7 +1296,8 @@ L130: a -= a_offset; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 12; nc = 0; @@ -1903,7 +1336,7 @@ L130: /* Generate the matrix B. */ zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, & - reset, &c_b1); + reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); for (ics = 1; ics <= 2; ++ics) { *(unsigned char *)side = *(unsigned char *)&ichs[ics - 1]; @@ -1931,7 +1364,8 @@ L130: /* Generate the hermitian or symmetric matrix A. */ zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax, - &aa[1], &lda, &reset, &c_b1); + &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); i__3 = *nalf; for (ia = 1; ia <= i__3; ++ia) { @@ -1946,7 +1380,8 @@ L130: /* Generate the matrix C. */ zmake_("ge", " ", " ", &m, &n, &c__[c_offset], - nmax, &cc[1], &ldc, &reset, &c_b1); + nmax, &cc[1], &ldc, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -1992,30 +1427,28 @@ L130: if (*trace) { zprcn2_(ntra, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, &beta, & - ldc) + ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1) ; } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } - if (conj) { + if (isconj) { czhemm_(iorder, side, uplo, &m, &n, &alpha, & aa[1], &lda, &bb[1], &ldb, &beta, &cc[ - 1], &ldc); + 1], &ldc, (ftnlen)1, (ftnlen)1); } else { czsymm_(iorder, side, uplo, &m, &n, &alpha, & aa[1], &lda, &bb[1], &ldb, &beta, &cc[ - 1], &ldc); + 1], &ldc, (ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___181.ciunit = *nout; - s_wsfe(&io___181); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L110; } @@ -2038,7 +1471,7 @@ L130: isame[10] = lze_(&cs[1], &cc[1], &lcc); } else { isame[10] = lzeres_("ge", " ", &m, &n, &cs[1], - &cc[1], &ldc); + &cc[1], &ldc, (ftnlen)2, (ftnlen)1); } isame[11] = ldcs == ldc; @@ -2050,11 +1483,7 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___184.ciunit = *nout; - s_wsfe(&io___184); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L40: */ } @@ -2072,13 +1501,15 @@ L130: a_offset], nmax, &b[b_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, - &err, fatal, nout, &c_true); + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } else { zmmch_("N", "N", &m, &n, &n, &alpha, &b[ b_offset], nmax, &a[a_offset], nmax, &beta, &c__[c_offset], nmax, &ct[1], &g[1], &cc[1], &ldc, eps, - &err, fatal, nout, &c_true); + &err, fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } errmax = f2cmax(errmax,err); /* If got really bad answer, report and */ @@ -2112,76 +1543,44 @@ L90: if (errmax < *thresh) { if (*iorder == 0) { - io___186.ciunit = *nout; - s_wsfe(&io___186); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___187.ciunit = *nout; - s_wsfe(&io___187); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___188.ciunit = *nout; - s_wsfe(&io___188); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___189.ciunit = *nout; - s_wsfe(&io___189); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L120; L110: - io___190.ciunit = *nout; - s_wsfe(&io___190); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, - &beta, &ldc); + &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); L120: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of ZCHK2. */ } /* zchk2_ */ -/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer - *iorder, char *side, char *uplo, integer *m, integer *n, - doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, - integer *ldc) +/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002," - "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)"; - /* Local variables */ - char cs[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___194 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___195 = { 0, 0, 0, fmt_9994, 0 }; - + static char cs[14], cu[14], crc[14]; if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); @@ -2198,123 +1597,57 @@ L120: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___194.ciunit = *nout; - s_wsfe(&io___194); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___195.ciunit = *nout; - s_wsfe(&io___195); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn2_ */ -/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nmax, doublecomplex *a, doublecomplex *aa, - doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex - *bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer * - iorder) +/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char ichu[2] = "UL"; - static char icht[3] = "NTC"; - static char ichd[2] = "UN"; - static char ichs[2] = "LR"; - - /* Format strings */ - static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char ichu[2+1] = "UL"; + static char icht[3+1] = "NTC"; + static char ichd[2+1] = "UN"; + static char ichs[2+1] = "LR"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; - alist al__1; /* Local variables */ - char diag[1]; - integer ldas, ldbs; - logical same; - char side[1]; - logical left, null; - char uplo[1]; - integer i__, j, m, n; - doublecomplex alpha; - char diags[1]; - logical isame[13]; - char sides[1]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - logical reset; - char uplos[1]; - integer ia, na; - extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer - *, char *, char *, char *, char *, integer *, integer *, - doublecomplex *, integer *, integer *); - integer nc, im, in, ms, ns; - char tranas[1], transa[1]; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, - char *, integer *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, - integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *); - integer laa, icd, lbb, lda, ldb, ics; - doublecomplex als; - integer ict, icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___236 = { 0, 0, 0, fmt_9994, 0 }; - static cilist io___239 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___241 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___242 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___243 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___244 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___245 = { 0, 0, 0, fmt_9996, 0 }; - - + static char diag[1]; + static integer ldas, ldbs; + static logical same; + static char side[1]; + static logical left, null; + static char uplo[1]; + static integer i__, j, m, n; + static doublecomplex alpha; + static char diags[1]; + static logical isame[13]; + static char sides[1]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static logical reset; + static char uplos[1]; + static integer ia, na; + extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen); + static integer nc, im, in, ms, ns; + static char tranas[1], transa[1]; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen); + static integer laa, icd, lbb, lda, ldb, ics; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZTRMM and ZTRSM. */ @@ -2326,6 +1659,17 @@ L120: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -2346,6 +1690,7 @@ L120: a -= a_offset; /* Function Body */ +/* .. Executable Statements .. */ nargs = 11; nc = 0; @@ -2421,12 +1766,14 @@ L120: zmake_("tr", uplo, diag, &na, &na, &a[ a_offset], nmax, &aa[1], &lda, &reset, - &c_b1); + &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); /* Generate the matrix B. */ zmake_("ge", " ", " ", &m, &n, &b[b_offset], - nmax, &bb[1], &ldb, &reset, &c_b1); + nmax, &bb[1], &ldb, &reset, &c_b1, ( + ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -2471,42 +1818,42 @@ L120: zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) + ftnlen)12, (ftnlen)1, (ftnlen) 1, (ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cztrmm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ - 1], &ldb); + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); } else if (s_cmp(sname + 9, "sm", (ftnlen)2, ( ftnlen)2) == 0) { if (*trace) { zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &alpha, &lda, &ldb, ( - ftnlen)13, (ftnlen)1, (ftnlen) + ftnlen)12, (ftnlen)1, (ftnlen) 1, (ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } cztrsm_(iorder, side, uplo, transa, diag, &m, &n, &alpha, &aa[1], &lda, &bb[ - 1], &ldb); + 1], &ldb, (ftnlen)1, (ftnlen)1, ( + ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___236.ciunit = *nout; - s_wsfe(&io___236); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -2531,7 +1878,8 @@ L120: isame[9] = lze_(&bs[1], &bb[1], &lbb); } else { isame[9] = lzeres_("ge", " ", &m, &n, &bs[ - 1], &bb[1], &ldb); + 1], &bb[1], &ldb, (ftnlen)2, ( + ftnlen)1); } isame[10] = ldbs == ldb; @@ -2543,11 +1891,7 @@ L120: for (i__ = 1; i__ <= i__4; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___239.ciunit = *nout; - s_wsfe(&io___239); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L50: */ } @@ -2578,7 +1922,8 @@ L120: c_b1, &c__[c_offset], nmax, &ct[1], &g[1], &bb[ 1], &ldb, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, ( + ftnlen)1, (ftnlen)1); } } else if (s_cmp(sname + 9, "sm", (ftnlen) 2, (ftnlen)2) == 0) { @@ -2612,7 +1957,8 @@ L120: c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & ldb, eps, &err, fatal, - nout, &c_false); + nout, &c_false, (ftnlen)1, + (ftnlen)1); } else { zmmch_("N", transa, &m, &n, &n, & c_b2, &c__[c_offset], @@ -2620,7 +1966,8 @@ L120: &c_b1, &b[b_offset], nmax, &ct[1], &g[1], &bb[1], & ldb, eps, &err, fatal, - nout, &c_false); + nout, &c_false, (ftnlen)1, + (ftnlen)1); } } errmax = f2cmax(errmax,err); @@ -2657,77 +2004,48 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___241.ciunit = *nout; - s_wsfe(&io___241); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___242.ciunit = *nout; - s_wsfe(&io___242); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___243.ciunit = *nout; - s_wsfe(&io___243); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___244.ciunit = *nout; - s_wsfe(&io___244); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L150: - io___245.ciunit = *nout; - s_wsfe(&io___245); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); if (*trace) { zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, & - alpha, &lda, &ldb); + alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen) + 1, (ftnlen)1); } L160: return 0; -/* L9995: */ +/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', */ +/* $ ' .' ) */ /* End of ZCHK3. */ } /* zchk3_ */ -/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer - *iorder, char *side, char *uplo, char *transa, char *diag, integer *m, - integer *n, doublecomplex *alpha, integer *lda, integer *ldb) +/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002," - "a14,\002,\002,a14,\002,\002)"; - static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 " - "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)." - "\002)"; /* Local variables */ - char ca[14], cd[14], cs[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___251 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___252 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cd[14], cs[14], cu[14], crc[14]; if (*(unsigned char *)side == 'L') { s_copy(cs, " CblasLeft", (ftnlen)14, (ftnlen)14); @@ -2756,134 +2074,61 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___251.ciunit = *nout; - s_wsfe(&io___251); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cs, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - e_wsfe(); - io___252.ciunit = *nout; - s_wsfe(&io___252); - do_fio(&c__1, ca, (ftnlen)14); - do_fio(&c__1, cd, (ftnlen)14); - do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu); + printf(" %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb); + +return 0; } /* zprcn3_ */ -/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, - doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, - doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal * - g, integer *iorder) +/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char icht[2] = "NC"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7; doublecomplex z__1; - alist al__1; /* Local variables */ - doublecomplex beta; - integer ldas, ldcs; - logical same, conj; - doublecomplex bets; - doublereal rals; - logical tran, null; - char uplo[1]; - integer i__, j, k, n; - doublecomplex alpha; - doublereal rbeta; - logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - doublereal rbets; - logical reset; - char trans[1]; - logical upper; - char uplos[1]; - integer ia, ib, jc, ma, na; - extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, doublecomplex *, integer *); - integer nc; - extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublereal *, integer *, - doublereal *, integer *); - integer ik, in, jj, lj, ks, ns; - doublereal ralpha; - extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, - integer *, doublereal *, doublecomplex *, integer *, doublereal *, - doublecomplex *, integer *); - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - char transs[1], transt[1]; - extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, doublecomplex *, integer *); - integer laa, lda, lcc, ldc; - doublecomplex als; - integer ict, icu; - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___294 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___297 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___304 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___305 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___306 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___307 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___308 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___309 = { 0, 0, 0, fmt_9996, 0 }; - - + static doublecomplex beta; + static integer ldas, ldcs; + static logical same, isconj; + static doublecomplex bets; + static doublereal rals; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na; + extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + static integer nc; + extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal ralpha; + extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lda, lcc, ldc; + static doublecomplex als; + static integer ict, icu; + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHERK and ZSYRK. */ @@ -2895,6 +2140,17 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -2918,12 +2174,15 @@ L160: a -= a_offset; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 10; nc = 0; reset = TRUE_; errmax = 0.; + rals = 1.; + rbets = 1.; i__1 = *nidim; for (in = 1; in <= i__1; ++in) { @@ -2946,7 +2205,7 @@ L160: for (ict = 1; ict <= 2; ++ict) { *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; tran = *(unsigned char *)trans == 'C'; - if (tran && ! conj) { + if (tran && ! isconj) { *(unsigned char *)trans = 'T'; } if (tran) { @@ -2970,7 +2229,7 @@ L160: /* Generate the matrix A. */ zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); for (icu = 1; icu <= 2; ++icu) { *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1]; @@ -2980,7 +2239,7 @@ L160: for (ia = 1; ia <= i__3; ++ia) { i__4 = ia; alpha.r = alf[i__4].r, alpha.i = alf[i__4].i; - if (conj) { + if (isconj) { ralpha = alpha.r; z__1.r = ralpha, z__1.i = 0.; alpha.r = z__1.r, alpha.i = z__1.i; @@ -2990,22 +2249,22 @@ L160: for (ib = 1; ib <= i__4; ++ib) { i__5 = ib; beta.r = bet[i__5].r, beta.i = bet[i__5].i; - if (conj) { + if (isconj) { rbeta = beta.r; z__1.r = rbeta, z__1.i = 0.; beta.r = z__1.r, beta.i = z__1.i; } null = n <= 0; - if (conj) { - null = null || (k <= 0 || ralpha == 0.) && - rbeta == 1.; + if (isconj) { + null = null ||( (k <= 0 || ralpha == 0.) && + rbeta == 1.); } /* Generate the matrix C. */ zmake_(sname + 7, uplo, " ", &n, &n, &c__[ c_offset], nmax, &cc[1], &ldc, &reset, & - c_b1); + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -3016,7 +2275,7 @@ L160: trans; ns = n; ks = k; - if (conj) { + if (isconj) { rals = ralpha; } else { als.r = alpha.r, als.i = alpha.i; @@ -3030,7 +2289,7 @@ L160: /* L10: */ } ldas = lda; - if (conj) { + if (isconj) { rbets = rbeta; } else { bets.r = beta.r, bets.i = beta.i; @@ -3047,40 +2306,42 @@ L160: /* Call the subroutine. */ - if (conj) { + if (isconj) { if (*trace) { zprcn6_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, & - rbeta, &ldc); + rbeta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czherk_(iorder, uplo, trans, &n, &k, &ralpha, - &aa[1], &lda, &rbeta, &cc[1], &ldc); + &aa[1], &lda, &rbeta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); } else { if (*trace) { zprcn4_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - beta, &ldc); + beta, &ldc, (ftnlen)12, (ftnlen)1, + (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czsyrk_(iorder, uplo, trans, &n, &k, &alpha, & - aa[1], &lda, &beta, &cc[1], &ldc); + aa[1], &lda, &beta, &cc[1], &ldc, ( + ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___294.ciunit = *nout; - s_wsfe(&io___294); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L120; } @@ -3093,7 +2354,7 @@ L160: char *)trans; isame[2] = ns == n; isame[3] = ks == k; - if (conj) { + if (isconj) { isame[4] = rals == ralpha; } else { isame[4] = als.r == alpha.r && als.i == @@ -3101,7 +2362,7 @@ L160: } isame[5] = lze_(&as[1], &aa[1], &laa); isame[6] = ldas == lda; - if (conj) { + if (isconj) { isame[7] = rbets == rbeta; } else { isame[7] = bets.r == beta.r && bets.i == @@ -3111,7 +2372,8 @@ L160: isame[8] = lze_(&cs[1], &cc[1], &lcc); } else { isame[8] = lzeres_(sname + 7, uplo, &n, &n, & - cs[1], &cc[1], &ldc); + cs[1], &cc[1], &ldc, (ftnlen)2, ( + ftnlen)1); } isame[9] = ldcs == ldc; @@ -3123,11 +2385,7 @@ L160: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___297.ciunit = *nout; - s_wsfe(&io___297); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); } /* L30: */ } @@ -3140,7 +2398,7 @@ L160: /* Check the result column by column. */ - if (conj) { + if (isconj) { *(unsigned char *)transt = 'C'; } else { *(unsigned char *)transt = 'T'; @@ -3162,7 +2420,8 @@ L160: nmax, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1], &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } else { zmmch_("N", transt, &lj, &c__1, &k, & alpha, &a[jj + a_dim1], nmax, @@ -3170,7 +2429,7 @@ L160: c__[jj + j * c_dim1], nmax, & ct[1], &g[1], &cc[jc], &ldc, eps, &err, fatal, nout, & - c_true); + c_true, (ftnlen)1, (ftnlen)1); } if (upper) { jc += ldc; @@ -3211,89 +2470,57 @@ L100: if (errmax < *thresh) { if (*iorder == 0) { - io___304.ciunit = *nout; - s_wsfe(&io___304); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___305.ciunit = *nout; - s_wsfe(&io___305); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___306.ciunit = *nout; - s_wsfe(&io___306); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___307.ciunit = *nout; - s_wsfe(&io___307); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L130; L110: if (n > 1) { - io___308.ciunit = *nout; - s_wsfe(&io___308); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L120: - io___309.ciunit = *nout; - s_wsfe(&io___309); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); - if (conj) { + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, - &rbeta, &ldc); + &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } else { zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - beta, &ldc); + beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } L130: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', */ +/* $ ' .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */ +/* $ '), C,', I3, ') .' ) */ /* End of CCHK4. */ } /* zchk4_ */ -/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc) +/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C" - ",\002,i3,\002).\002)"; - /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___313 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___314 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -3312,45 +2539,19 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___313.ciunit = *nout; - s_wsfe(&io___313); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___314.ciunit = *nout; - s_wsfe(&io___314); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc); + +return 0; } /* zprcn4_ */ -/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, doublereal - *alpha, integer *lda, doublereal *beta, integer *ldc) +/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3" - ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)"; /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___318 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___319 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -3369,132 +2570,58 @@ L130: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___318.ciunit = *nout; - s_wsfe(&io___318); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___319.ciunit = *nout; - s_wsfe(&io___319); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("( %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc); + +return 0; } /* zprcn6_ */ -/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, - integer *nout, integer *ntra, logical *trace, logical *rewi, logical * - fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex * - alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex * - ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, - doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, - doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w, - integer *iorder) +/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len) { /* Initialized data */ - static char icht[2] = "NC"; - static char ichu[2] = "UL"; - - /* Format strings */ - static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT " - "TAKEN ON VALID\002,\002 CALL *******\002)"; - static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER" - " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)"; - static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR C" - "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)"; - static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO" - "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR " - " COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **" - "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP" - "ECT *******\002)"; - static char fmt_9995[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; - static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM" - "BER:\002)"; + static char icht[2+1] = "NC"; + static char ichu[2+1] = "UL"; /* System generated locals */ integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8; doublecomplex z__1, z__2; - alist al__1; /* Local variables */ - integer jjab; - doublecomplex beta; - integer ldas, ldbs, ldcs; - logical same, conj; - doublecomplex bets; - logical tran, null; - char uplo[1]; - integer i__, j, k, n; - doublecomplex alpha; - doublereal rbeta; - logical isame[13]; - extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *, - logical *, doublecomplex *); - integer nargs; - extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, doublereal *, doublecomplex *, - integer *, doublereal *, doublereal *, logical *, integer *, - logical *); - doublereal rbets; - logical reset; - char trans[1]; - logical upper; - char uplos[1]; - integer ia, ib, jc, ma, na, nc; - extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer - *, char *, char *, integer *, integer *, doublecomplex *, integer - *, integer *, doublecomplex *, integer *), - zprcn7_(integer *, integer *, char *, integer *, char *, char *, - integer *, integer *, doublecomplex *, integer *, integer *, - doublereal *, integer *); - integer ik, in, jj, lj, ks, ns; - doublereal errmax; - extern logical lzeres_(char *, char *, integer *, integer *, - doublecomplex *, doublecomplex *, integer *); - char transs[1], transt[1]; - extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublereal *, doublecomplex *, - integer *); - integer laa, lbb, lda, lcc, ldb, ldc; - doublecomplex als; - integer ict, icu; - extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *, - integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *); - doublereal err; - extern logical lze_(doublecomplex *, doublecomplex *, integer *); - - /* Fortran I/O blocks */ - static cilist io___362 = { 0, 0, 0, fmt_9992, 0 }; - static cilist io___365 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___373 = { 0, 0, 0, fmt_10000, 0 }; - static cilist io___374 = { 0, 0, 0, fmt_10001, 0 }; - static cilist io___375 = { 0, 0, 0, fmt_10002, 0 }; - static cilist io___376 = { 0, 0, 0, fmt_10003, 0 }; - static cilist io___377 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___378 = { 0, 0, 0, fmt_9996, 0 }; - - + static integer jjab; + static doublecomplex beta; + static integer ldas, ldbs, ldcs; + static logical same, isconj; + static doublecomplex bets; + static logical tran, null; + static char uplo[1]; + static integer i__, j, k, n; + static doublecomplex alpha; + static doublereal rbeta; + static logical isame[13]; + extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen); + static integer nargs; + extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen); + static doublereal rbets; + static logical reset; + static char trans[1]; + static logical upper; + static char uplos[1]; + static integer ia, ib, jc, ma, na, nc; + extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen); + extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen); + static integer ik, in, jj, lj, ks, ns; + static doublereal errmax; + extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static char transs[1], transt[1]; + extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen); + static integer laa, lbb, lda, lcc, ldb, ldc; + static doublecomplex als; + static integer ict, icu; + extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen); + static doublereal err; + extern logical lze_(doublecomplex*, doublecomplex*, integer*); /* Tests ZHER2K and ZSYR2K. */ @@ -3506,6 +2633,17 @@ L130: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Local Arrays .. */ +/* .. External Functions .. */ +/* .. External Subroutines .. */ +/* .. Intrinsic Functions .. */ +/* .. Scalars in Common .. */ +/* .. Common blocks .. */ +/* .. Data statements .. */ /* Parameter adjustments */ --idim; --alf; @@ -3525,7 +2663,8 @@ L130: --ab; /* Function Body */ - conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; +/* .. Executable Statements .. */ + isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0; nargs = 12; nc = 0; @@ -3553,7 +2692,7 @@ L130: for (ict = 1; ict <= 2; ++ict) { *(unsigned char *)trans = *(unsigned char *)&icht[ict - 1]; tran = *(unsigned char *)trans == 'C'; - if (tran && ! conj) { + if (tran && ! isconj) { *(unsigned char *)trans = 'T'; } if (tran) { @@ -3579,10 +2718,12 @@ L130: if (tran) { i__3 = *nmax << 1; zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); } else { zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], & - lda, &reset, &c_b1); + lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen) + 1); } /* Generate the matrix B. */ @@ -3592,10 +2733,12 @@ L130: if (tran) { i__3 = *nmax << 1; zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1] - , &ldb, &reset, &c_b1); + , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, ( + ftnlen)1); } else { zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax, - &bb[1], &ldb, &reset, &c_b1); + &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen) + 1, (ftnlen)1); } for (icu = 1; icu <= 2; ++icu) { @@ -3611,22 +2754,22 @@ L130: for (ib = 1; ib <= i__4; ++ib) { i__5 = ib; beta.r = bet[i__5].r, beta.i = bet[i__5].i; - if (conj) { + if (isconj) { rbeta = beta.r; z__1.r = rbeta, z__1.i = 0.; beta.r = z__1.r, beta.i = z__1.i; } null = n <= 0; - if (conj) { - null = null || (k <= 0 || alpha.r == 0. && - alpha.i == 0.) && rbeta == 1.; + if (isconj) { + null = null ||( (k <= 0 || (alpha.r == 0. && + alpha.i == 0.)) && rbeta == 1.); } /* Generate the matrix C. */ zmake_(sname + 7, uplo, " ", &n, &n, &c__[ c_offset], nmax, &cc[1], &ldc, &reset, & - c_b1); + c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1); ++nc; @@ -3656,7 +2799,7 @@ L130: /* L20: */ } ldbs = ldb; - if (conj) { + if (isconj) { rbets = rbeta; } else { bets.r = beta.r, bets.i = beta.i; @@ -3673,42 +2816,42 @@ L130: /* Call the subroutine. */ - if (conj) { + if (isconj) { if (*trace) { zprcn7_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &ldb, - &rbeta, &ldc); + &rbeta, &ldc, (ftnlen)12, ( + ftnlen)1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czher2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &rbeta, & - cc[1], &ldc); + cc[1], &ldc, (ftnlen)1, (ftnlen)1); } else { if (*trace) { zprcn5_(ntra, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &ldb, - &beta, &ldc); + &beta, &ldc, (ftnlen)12, (ftnlen) + 1, (ftnlen)1); } if (*rewi) { - al__1.aerr = 0; +/* al__1.aerr = 0; al__1.aunit = *ntra; - f_rew(&al__1); + f_rew(&al__1);*/ } czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, &aa[1], &lda, &bb[1], &ldb, &beta, & - cc[1], &ldc); + cc[1], &ldc, (ftnlen)1, (ftnlen)1); } /* Check if error-exit was taken incorrectly. */ if (! infoc_1.ok) { - io___362.ciunit = *nout; - s_wsfe(&io___362); - e_wsfe(); + printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n"); *fatal = TRUE_; goto L150; } @@ -3726,7 +2869,7 @@ L130: isame[6] = ldas == lda; isame[7] = lze_(&bs[1], &bb[1], &lbb); isame[8] = ldbs == ldb; - if (conj) { + if (isconj) { isame[9] = rbets == rbeta; } else { isame[9] = bets.r == beta.r && bets.i == @@ -3736,7 +2879,7 @@ L130: isame[10] = lze_(&cs[1], &cc[1], &lcc); } else { isame[10] = lzeres_("he", uplo, &n, &n, &cs[1] - , &cc[1], &ldc); + , &cc[1], &ldc, (ftnlen)2, (ftnlen)1); } isame[11] = ldcs == ldc; @@ -3748,12 +2891,8 @@ L130: for (i__ = 1; i__ <= i__5; ++i__) { same = same && isame[i__ - 1]; if (! isame[i__ - 1]) { - io___365.ciunit = *nout; - s_wsfe(&io___365); - do_fio(&c__1, (char *)&i__, (ftnlen) - sizeof(integer)); - e_wsfe(); - } + printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__); + } /* L40: */ } if (! same) { @@ -3765,7 +2904,7 @@ L130: /* Check the result column by column. */ - if (conj) { + if (isconj) { *(unsigned char *)transt = 'C'; } else { *(unsigned char *)transt = 'T'; @@ -3785,7 +2924,7 @@ L130: i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { i__7 = i__; - i__8 = (j - 1 << 1) * *nmax + k + + i__8 = ((j - 1) << 1) * *nmax + k + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8].i, @@ -3794,17 +2933,17 @@ L130: i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; - if (conj) { + if (isconj) { i__7 = k + i__; d_cnjg(&z__2, &alpha); - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[ i__8].r; w[i__7].r = z__1.r, w[i__7].i = z__1.i; } else { i__7 = k + i__; - i__8 = (j - 1 << 1) * *nmax + i__; + i__8 = ((j - 1) << 1) * *nmax + i__; z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8] .i, z__1.i = alpha.r * ab[i__8].i + alpha.i * ab[i__8].r; @@ -3820,11 +2959,12 @@ L130: 1], &i__8, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1] , &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } else { i__6 = k; for (i__ = 1; i__ <= i__6; ++i__) { - if (conj) { + if (isconj) { i__7 = i__; d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]); z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, @@ -3861,7 +3001,8 @@ L130: i__7, &beta, &c__[jj + j * c_dim1], nmax, &ct[1], &g[1], &cc[jc], &ldc, eps, &err, - fatal, nout, &c_true); + fatal, nout, &c_true, (ftnlen) + 1, (ftnlen)1); } if (upper) { jc += ldc; @@ -3905,90 +3046,57 @@ L130: if (errmax < *thresh) { if (*iorder == 0) { - io___373.ciunit = *nout; - s_wsfe(&io___373); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } if (*iorder == 1) { - io___374.ciunit = *nout; - s_wsfe(&io___374); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - e_wsfe(); + printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc); } } else { if (*iorder == 0) { - io___375.ciunit = *nout; - s_wsfe(&io___375); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } if (*iorder == 1) { - io___376.ciunit = *nout; - s_wsfe(&io___376); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc); + printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax); } } goto L160; L140: if (n > 1) { - io___377.ciunit = *nout; - s_wsfe(&io___377); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d:\n",j); } L150: - io___378.ciunit = *nout; - s_wsfe(&io___378); - do_fio(&c__1, sname, (ftnlen)13); - e_wsfe(); - if (conj) { + printf(" ******* %s FAILED ON CALL NUMBER:\n",sname); + if (isconj) { zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - ldb, &rbeta, &ldc); + ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } else { zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, & - ldb, &beta, &ldc); + ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1); } L160: return 0; -/* L9994: */ -/* L9993: */ +/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */ +/* $ ', C,', I3, ') .' ) */ +/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */ +/* $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */ +/* $ ',', F4.1, '), C,', I3, ') .' ) */ /* End of ZCHK5. */ } /* zchk5_ */ -/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta, - integer *ldc) +/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002" - ",f4.1,\002), C,\002,i3,\002).\002)"; - /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___382 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___383 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -4007,48 +3115,19 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___382.ciunit = *nout; - s_wsfe(&io___382); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___383.ciunit = *nout; - s_wsfe(&io___383); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc); + +return 0; } /* zprcn5_ */ -/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer - *iorder, char *uplo, char *transa, integer *n, integer *k, - doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, - integer *ldc) +/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len) { - /* Format strings */ - static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002," - "\002))"; - static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002" - ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C," - "\002,i3,\002).\002)"; /* Local variables */ - char ca[14], cu[14], crc[14]; - - /* Fortran I/O blocks */ - static cilist io___387 = { 0, 0, 0, fmt_9995, 0 }; - static cilist io___388 = { 0, 0, 0, fmt_9994, 0 }; - + static char ca[14], cu[14], crc[14]; if (*(unsigned char *)uplo == 'U') { s_copy(cu, " CblasUpper", (ftnlen)14, (ftnlen)14); @@ -4067,31 +3146,14 @@ L160: } else { s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14); } - io___387.ciunit = *nout; - s_wsfe(&io___387); - do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer)); - do_fio(&c__1, sname, (ftnlen)13); - do_fio(&c__1, crc, (ftnlen)14); - do_fio(&c__1, cu, (ftnlen)14); - do_fio(&c__1, ca, (ftnlen)14); - e_wsfe(); - io___388.ciunit = *nout; - s_wsfe(&io___388); - do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer)); - do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal)); - do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer)); - e_wsfe(); - return 0; + printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca); + printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc); + +return 0; } /* zprcn7_ */ -/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, - integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, - integer *lda, logical *reset, doublecomplex *transl) +/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; @@ -4099,13 +3161,13 @@ L160: doublecomplex z__1, z__2; /* Local variables */ - integer ibeg, iend; - extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *); - logical unit; - integer i__, j; - logical lower, upper; - integer jj; - logical gen, her, tri, sym; + static integer ibeg, iend; + extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*); + static logical unit; + static integer i__, j; + static logical lower, upper; + static integer jj; + static logical gen, her, tri, sym; /* Generates values for an M by N matrix A. */ @@ -4122,6 +3184,13 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. External Functions .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *nmax; a_offset = 1 + a_dim1 * 1; @@ -4143,7 +3212,7 @@ L160: for (j = 1; j <= i__1; ++j) { i__2 = *m; for (i__ = 1; i__ <= i__2; ++i__) { - if (gen || upper && i__ <= j || lower && i__ >= j) { + if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { i__3 = i__ + j * a_dim1; zbeg_(&z__2, reset); z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i; @@ -4266,22 +3335,8 @@ L160: } /* zmake_ */ -/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer * - n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, - doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex * - c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex * - cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, - integer *nout, logical *mv) +/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len) { - /* Format strings */ - static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS" - " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002 " - " EXPECTED RE\002,\002SULT COMPUTED R" - "ESULT\002)"; - static char fmt_9998[] = "(1x,i7,2(\002 (\002,g15.6,\002,\002,g15.6," - "\002)\002))"; - static char fmt_9997[] = "(\002 THESE ARE THE RESULTS FOR COLUMN" - " \002,i3)"; /* System generated locals */ integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, @@ -4289,18 +3344,11 @@ L160: doublereal d__1, d__2, d__3, d__4, d__5, d__6; doublecomplex z__1, z__2, z__3, z__4; + double sqrt(double); /* Local variables */ - doublereal erri; - integer i__, j, k; - logical trana, tranb, ctrana, ctranb; - - /* Fortran I/O blocks */ - static cilist io___409 = { 0, 0, 0, fmt_9999, 0 }; - static cilist io___410 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___411 = { 0, 0, 0, fmt_9998, 0 }; - static cilist io___412 = { 0, 0, 0, fmt_9997, 0 }; - - + static doublereal erri; + static integer i__, j, k; + static logical trana, tranb, ctrana, ctranb; /* Checks the results of the computational tests. */ @@ -4312,6 +3360,14 @@ L160: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Parameters .. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Intrinsic Functions .. */ +/* .. Statement Functions .. */ +/* .. Statement Function definitions .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; @@ -4638,35 +3694,19 @@ L160: L230: *fatal = TRUE_; - io___409.ciunit = *nout; - s_wsfe(&io___409); - e_wsfe(); + printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n"); + printf(" EXPECTED RESULT COMPUTED RESULT\n"); i__1 = *m; for (i__ = 1; i__ <= i__1; ++i__) { if (*mv) { - io___410.ciunit = *nout; - s_wsfe(&io___410); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( - doublereal)); - e_wsfe(); - } else { - io___411.ciunit = *nout; - s_wsfe(&io___411); - do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer)); - do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof( - doublereal)); - do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal)); - e_wsfe(); + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i); + } else { + printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i); } /* L240: */ } if (*n > 1) { - io___412.ciunit = *nout; - s_wsfe(&io___412); - do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer)); - e_wsfe(); + printf(" THESE ARE THE RESULTS FOR COLUMN %d\n",j); } L250: @@ -4677,14 +3717,14 @@ L250: } /* zmmch_ */ -logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) +logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr) { /* System generated locals */ integer i__1, i__2, i__3; logical ret_val; /* Local variables */ - integer i__; + static integer i__; /* Tests if two arrays are identical. */ @@ -4697,6 +3737,10 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr) /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ --rj; --ri; @@ -4722,16 +3766,15 @@ L30: } /* lze_ */ -logical lzeres_(char *type__, char *uplo, integer *m, integer *n, - doublecomplex *aa, doublecomplex *as, integer *lda) +logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len) { /* System generated locals */ integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4; logical ret_val; /* Local variables */ - integer ibeg, iend, i__, j; - logical upper; + static integer ibeg, iend, i__, j; + static logical upper; /* Tests if selected elements in two arrays are equal. */ @@ -4746,6 +3789,10 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n, /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Array Arguments .. */ +/* .. Local Scalars .. */ +/* .. Executable Statements .. */ /* Parameter adjustments */ as_dim1 = *lda; as_offset = 1 + as_dim1 * 1; @@ -4803,7 +3850,7 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n, } } -/* L60: */ +/* 60 CONTINUE */ ret_val = TRUE_; goto L80; L70: @@ -4815,7 +3862,7 @@ L80: } /* lzeres_ */ -/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset) +/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset) { /* System generated locals */ doublereal d__1, d__2; @@ -4836,6 +3883,11 @@ L80: /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Local Scalars .. */ +/* .. Save statement .. */ +/* .. Intrinsic Functions .. */ +/* .. Executable Statements .. */ if (*reset) { /* Initialize local variables. */ mi = 891; @@ -4873,7 +3925,7 @@ L10: } /* zbeg_ */ -doublereal ddiff_(doublereal *x, doublereal *y) +doublereal ddiff_(doublereal* x, doublereal* y) { /* System generated locals */ doublereal ret_val; @@ -4887,6 +3939,8 @@ doublereal ddiff_(doublereal *x, doublereal *y) /* Jeremy Du Croz, Numerical Algorithms Group Ltd. */ /* Sven Hammarling, Numerical Algorithms Group Ltd. */ +/* .. Scalar Arguments .. */ +/* .. Executable Statements .. */ ret_val = *x - *y; return ret_val; @@ -4894,4 +3948,4 @@ doublereal ddiff_(doublereal *x, doublereal *y) } /* ddiff_ */ -/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; } +/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/ From 5d929d2706f92b5fa70122b865dafe72aac6ea84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 21:00:57 +0100 Subject: [PATCH 706/718] avoid overriding the global USE_GEMM3M --- ctest/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index d7baadee4..c56a78346 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -10,11 +10,6 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") endif() -set (USE_GEMM3M 0) -if (${ARCH} MATCHES x86|x86_64|ia64|mips) - set(USE_GEMM3M 1) -endif () - if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 "$ErrorActionPreference = \"Stop\"\n" From 28f151808ea3fa3b39e02948d7b52616ce52cdfb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 21:01:46 +0100 Subject: [PATCH 707/718] Avoid overriding the global USE_GEMM3M --- test/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b4bf36cee..4ebd5348c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,9 +22,7 @@ if (BUILD_COMPLEX16) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) endif() -set (USE_GEMM3M 0) -if (${ARCH} MATCHES x86|x86_64|ia64|mips) - set(USE_GEMM3M 1) +if (USE_GEMM3M) if (BUILD_COMPLEX) list (APPEND OpenBLAS_Tests cblat3_3m) endif () From 38283f678ed7683132f7b16f82fc6b13602f969a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 22:22:48 +0100 Subject: [PATCH 708/718] Fix portability problems --- utest/test_extensions/test_cgemv_t.c | 4 ++-- utest/test_extensions/test_csbmv.c | 6 +++--- utest/test_extensions/test_idamin.c | 8 +++++--- utest/test_extensions/test_isamin.c | 8 +++++--- utest/test_extensions/test_zgemv_t.c | 4 ++-- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c index aa3281e66..cb4e5ad9e 100644 --- a/utest/test_extensions/test_cgemv_t.c +++ b/utest/test_extensions/test_cgemv_t.c @@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, srand_generate(data_cgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) { + for (i = 0; i < m * inc_y * 2; i++) { data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; } @@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c index 8e8ce4530..41c24a2b7 100644 --- a/utest/test_extensions/test_csbmv.c +++ b/utest/test_extensions/test_csbmv.c @@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint char trans = 'N'; // Symmetric band packed matrix for sbmv - float a[lda * n * 2]; + float *a = (float*) malloc(lda * n * 2 * sizeof(float)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test srand_generate(data_csbmv.sp_matrix, n * (n + 1)); @@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint // Find the differences between output vector caculated by csbmv and cgemv for (i = 0; i < n * inc_c * 2; i++) data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; - + free(a); // Find the norm of differences return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); } @@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c index 6a7ed9d1e..bebe76dba 100644 --- a/utest/test_extensions/test_idamin.c +++ b/utest/test_extensions/test_idamin.c @@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){ CTEST(idamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*)malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(idamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){ CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - double x[ELEMENTS * inc]; + double *x = (double*) malloc(ELEMENTS * inc * sizeof(double)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0; blasint index = cblas_idamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index 4ff235b83..673e656df 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){ CTEST(isamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*) (ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = BLASFUNC(isamin)(&N, x, &inc); + free(x); ASSERT_EQUAL(N, index); } @@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){ CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float x[ELEMENTS * inc]; + float *x = (float*)malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } x[(N - 1) * inc] = 0.0f; blasint index = cblas_isamin(N, x, inc); + free(x); ASSERT_EQUAL(N - 1, index); } -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c index 2e0ee65f0..b2d0b2713 100644 --- a/utest/test_extensions/test_zgemv_t.c +++ b/utest/test_extensions/test_zgemv_t.c @@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n drand_generate(data_zgemv_t.y_test, m * inc_y * 2); // Copy vector y for reference funcs - for (int i = 0; i < m * inc_y * 2; i++) + for (i = 0; i < m * inc_y * 2; i++) { data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; } @@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif From f7ffab870b6992d14173b0439b803da04ba1ab12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Feb 2024 23:03:10 +0100 Subject: [PATCH 709/718] fix missing malloc --- utest/test_extensions/test_isamin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c index 673e656df..d93813e6f 100644 --- a/utest/test_extensions/test_isamin.c +++ b/utest/test_extensions/test_isamin.c @@ -402,7 +402,7 @@ CTEST(isamin, min_idx_in_vec_tail){ CTEST(isamin, min_idx_in_vec_tail_inc_1){ blasint i; blasint N = ELEMENTS, inc = 1; - float *x = (float*) (ELEMENTS * inc * sizeof(float)); + float *x = (float*) malloc(ELEMENTS * inc * sizeof(float)); for (i = 0; i < N * inc; i ++) { x[i] = i + 1000; } From f81c1d4b598e16205f31f8146ad64e4069fe12f9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 07:19:52 +0100 Subject: [PATCH 710/718] Fix portability problem --- utest/test_extensions/test_zsbmv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c index afdb208c1..0e79dc0d8 100644 --- a/utest/test_extensions/test_zsbmv.c +++ b/utest/test_extensions/test_zsbmv.c @@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin char trans = 'N'; // Symmetric band packed matrix for sbmv - double a[lda * n * 2]; + double *a = (double*) malloc(lda * n * 2 * sizeof(double)); // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); @@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); + free(a); // Find the differences between output vector caculated by zsbmv and zgemv for (i = 0; i < n * inc_c * 2; i++) data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; @@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid) int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); ASSERT_EQUAL(TRUE, passed); } -#endif \ No newline at end of file +#endif From 8e872a91a946c4fe646b9bb85732f92e3ae6424e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 11:28:50 +0100 Subject: [PATCH 711/718] Fix erroneous mapping of SUM kernels to ASUM --- kernel/zarch/KERNEL.Z13 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index 3bcc32197..fe82d81e6 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c -SSUMKERNEL = ../arm/asum.c -DSUMKERNEL = dasum.c -CSUMKERNEL = ../arm/zasum.c -ZSUMKERNEL = zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = dsum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = zsum.c SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c From 4266b393046f2b12c9f2b5724889245380c0293d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 17:52:45 +0100 Subject: [PATCH 712/718] Make building the benchmarks optional and handle dependency on other options --- CMakeLists.txt | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af3050cf9..30adf3a66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) +option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) + option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) @@ -458,9 +460,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") endif() endif() -if (BUILD_TESTING) - find_package(OpenMP REQUIRED) +if (BUILD_BENCHMARKS) + #find_package(OpenMP REQUIRED) file(GLOB SOURCES "benchmark/*.c") + if (NOT USE_OPENMP) + file(GLOB REMFILE "benchmark/smallscaling.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (BUILD_WITHOUT_LAPACK) + file(GLOB REMFILE "benchmark/cholesky.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/geev.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() + if (NOT USE_GEMM3M) + file(GLOB REMFILE "benchmark/gemm3m.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + endif() foreach(source ${SOURCES}) get_filename_component(name ${source} NAME_WE) if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper")) @@ -477,7 +493,8 @@ if (BUILD_TESTING) (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE")) add_executable(${target_name} ${source}) target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) + target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} ) + # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C) if (NOT "${define}" STREQUAL "DEFAULT") target_compile_definitions(${target_name} PRIVATE ${define}) endif() From cfc28c586ebcc840623d79560710a8485eea9ec2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 19:55:12 +0100 Subject: [PATCH 713/718] Exclude LAPACK testsuite and LAPACK-dependent benchmarks in no-LAPACK builds --- CMakeLists.txt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30adf3a66..9fbe878e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,7 +330,7 @@ if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) endif() - if (BUILD_TESTING) + if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK) add_subdirectory(lapack-netlib/TESTING) endif() endif() @@ -472,6 +472,18 @@ if (BUILD_BENCHMARKS) list(REMOVE_ITEM SOURCES ${REMFILE}) file(GLOB REMFILE "benchmark/geev.c") list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/gesv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/getri.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/potrf.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/spmv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/symv.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) + file(GLOB REMFILE "benchmark/linpack.c") + list(REMOVE_ITEM SOURCES ${REMFILE}) endif() if (NOT USE_GEMM3M) file(GLOB REMFILE "benchmark/gemm3m.c") From d1409407a090a46512e1ecddc6633943ccef99be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Feb 2024 21:05:59 +0100 Subject: [PATCH 714/718] Omit redundant prefixes or suffixes in library naming --- Makefile.system | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index cb5453cac..aadf3459a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX LIBNAMEPREFIX = endif +SYMPREFIX=$(SYMBOLPREFIX) +ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX)) +SYMPREFIX= +endif +SYMSUFFIX=$(SYMBOLSUFFIX) +ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX)) +SYMSUFFIX= +endif ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) From c508a10cf2413fc5381376c4586303e0fa7ffbd0 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Feb 2024 20:09:42 -0500 Subject: [PATCH 715/718] LoongArch64: Add cgemv LSX opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/cgemv_n_4_lsx.S | 323 +++++++++++++++++++++++ kernel/loongarch64/cgemv_t_4_lsx.S | 290 ++++++++++++++++++++ 3 files changed, 616 insertions(+) create mode 100644 kernel/loongarch64/cgemv_n_4_lsx.S create mode 100644 kernel/loongarch64/cgemv_t_4_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef44035..befe68451 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -100,6 +100,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S new file mode 100644 index 000000000..cf8273797 --- /dev/null +++ b/kernel/loongarch64/cgemv_n_4_lsx.S @@ -0,0 +1,323 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro CLOAD_X_4 + GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_4_GAP + vldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + vldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + vldrepl.d X3, T0, 0x00 + + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_X_1 + GLDREPL v, d, X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro CLOAD_Y_4 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + vpackev.d Y0, A1, Y0 + vpackev.d Y1, A5, Y1 +.endm + +.macro CLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro CSTORE_Y_4 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro CSTORE_Y_4_GAP + vstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + vstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + vstelm.d Y1, T0, 0, 1 +.endm + +.macro CSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +.macro CGEMV_N_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \ + Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \ + Y0, X3, A6, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + CLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\Y_4 + CGEMV_N_4x4 + CSTORE_\Y_4 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x4 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END +.L_\XW\()_N_L1: + CLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + CLOAD_\Y_1 + CGEMV_N_1x1 + CSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S new file mode 100644 index 000000000..ada349364 --- /dev/null +++ b/kernel/loongarch64/cgemv_t_4_lsx.S @@ -0,0 +1,290 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y4 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro CLOAD_X4 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro CLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f3, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f2, T0, 0x00 + fldx.d $f4, T0, INC_X + vpackev.d X0, A0, X0 + vpackev.d X1, A1, X1 +.endm + +.macro CGEMV_T_4x4 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \ + TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \ + TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2 +.endm + +.macro CGEMV_T_LSX XW:req, X4:req + PTR_SRLI J, N, 2 + beqz J, .L_\XW\()_N_3 + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L4: + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 2 + beqz I, .L_\XW\()_M_3 +.align 5 +.L_\XW\()_M_L4: + CLOAD_\X4 + CGEMV_T_4x4 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 2 + bnez I, .L_\XW\()_M_L4 +.L_\XW\()_M_3: + // Accumulated + GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#else + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \ + A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\ + A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 + bnez J, .L_\XW\()_N_L4 +.L_\XW\()_N_3: + andi J, N, 3 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M8 +.L_\XW\()_N_1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + fld.d $f3, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + fst.d $f3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + // Init VALPHA + vpackev.w $vr0, $vr1, $vr0 + vpackev.d VALPHA, $vr0, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#else + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + CGEMV_T_LSX GAP_0, X4 +.L_GAP_1: /* if (incx != 1) */ + CGEMV_T_LSX GAP_1, X4_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 3f22fc22333ae7b739e0620d64710e3905ef1dc1 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 27 Feb 2024 04:16:49 -0500 Subject: [PATCH 716/718] LoongArch64: Add zgemv LSX opt --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 3 + kernel/loongarch64/loongarch64_asm.S | 3 + kernel/loongarch64/zgemv_n_2_lsx.S | 296 +++++++++++++++++++++++ kernel/loongarch64/zgemv_t_2_lsx.S | 268 ++++++++++++++++++++ 4 files changed, 570 insertions(+) create mode 100644 kernel/loongarch64/zgemv_n_2_lsx.S create mode 100644 kernel/loongarch64/zgemv_t_2_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index befe68451..15d4358be 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -118,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S index fee46d63e..d097b3045 100644 --- a/kernel/loongarch64/loongarch64_asm.S +++ b/kernel/loongarch64/loongarch64_asm.S @@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .ifeqs "\suf_op", "s" vpackod.d \out, \in, \in \pre_op\()add.\suf_op \out, \out, \in +.else + vor.v \out, \in, \in .endif .endif + .ifnb \more GCOMPLEXACC \pre_op, \suf_op, \more .endif diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S new file mode 100644 index 000000000..efb376118 --- /dev/null +++ b/kernel/loongarch64/zgemv_n_2_lsx.S @@ -0,0 +1,296 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M16 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $vr1 +#define X0 $vr2 +#define X1 $vr3 +#define X2 $vr4 +#define X3 $vr5 +#define X4 $vr6 +#define X5 $vr7 +#define X6 $vr8 +#define X7 $vr9 +#define Y0 $vr10 +#define Y1 $vr11 +#define A0 $vr12 +#define A1 $vr13 +#define A2 $vr14 +#define A3 $vr15 +#define A4 $vr16 +#define A5 $vr17 +#define A6 $vr18 +#define A7 $vr19 +#define A8 $vr20 +#define A9 $vr21 +#define A10 $vr22 +#define A11 $vr23 +#define A12 $vr24 +#define A13 $vr25 +#define A14 $vr26 +#define A15 $vr27 +#define TMP0 $vr28 +#define TMP1 $vr29 +#define TMP2 $vr30 + +#if !defined(CONJ) +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 0 +#else +#define GXCONJ 1 +#define GCONJ 0 +#endif +#else +#if !defined(XCONJ) +#define GXCONJ 0 +#define GCONJ 1 +#else +#define GXCONJ 1 +#define GCONJ 1 +#endif +#endif + +.macro ZLOAD_X_2 + GLD v, , X0, X, 0x00, X1, X, 0x10 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_2_GAP + vld X0, X, 0 + PTR_ADD T0, X, INC_X + vld X1, T0, 0 + + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_X_1 + GLD v, , X0, X, 0x00 + GCOMPLEXMUL GXCONJ, \ + vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 +.endm + +.macro ZLOAD_Y_2 + GLD v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZLOAD_Y_2_GAP + vld $vr10, Y, 0 + vldx $vr11, Y, INC_Y +.endm + +.macro ZLOAD_Y_1 + vld $vr10, Y, 0 +.endm + +.macro ZGEMV_N_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x2 + GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ + Y0, X1, A2, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_N_1x1 + GLD_INC v, , 0x10, $vr12, PA0, 0 + GCOMPLEXMADD GXCONJ, GCONJ, \ + vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 +.endm + +.macro ZSTORE_Y_2 + GST v, , Y0, Y, 0, Y1, Y, 0x10 +.endm + +.macro ZSTORE_Y_2_GAP + vst Y0, Y, 0 + vstx Y1, Y, INC_Y +.endm + +.macro ZSTORE_Y_1 + vst $vr10, Y, 0 +.endm + +.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\Y_2 + ZGEMV_N_2x2 + ZSTORE_\Y_2 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 1 + PTR_ADDI K, K, 4 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x2 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + PTR_ALSL X, INC_X, X, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END +.L_\XW\()_N_L1: + ZLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + ZLOAD_\Y_1 + ZGEMV_N_1x1 + ZSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + PTR_SUB K_LDA, LDA, M16 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD X, X, INC_X + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 31 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 // Obtain the offset address + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 31 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S new file mode 100644 index 000000000..2a0fc172e --- /dev/null +++ b/kernel/loongarch64/zgemv_t_2_lsx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M16 $r30 + +#define VALPHA $vr0 +#define X0 $vr1 +#define X1 $vr2 +#define A0 $vr3 +#define A1 $vr4 +#define A2 $vr5 +#define A3 $vr6 +#define A4 $vr7 +#define A5 $vr8 +#define A6 $vr9 +#define A7 $vr10 +#define A8 $vr11 +#define A9 $vr12 +#define A10 $vr13 +#define A11 $vr14 +#define A12 $vr15 +#define A13 $vr16 +#define A14 $vr17 +#define A15 $vr18 +#define TP0 $vr19 +#define TP1 $vr20 +#define TP2 $vr21 +#define TP3 $vr22 +#define TP4 $vr23 +#define TP5 $vr24 +#define TP6 $vr25 +#define TP7 $vr26 +#define TMP0 $vr27 +#define TMP1 $vr28 +#define TMP2 $vr29 +#define Y0 $vr3 +#define Y1 $vr4 +#define Y2 $vr5 +#define Y3 $vr6 +#define Y4 $vr7 +#define Y5 $vr8 +#define Y6 $vr9 +#define Y7 $vr10 + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define GXCONJ1 0 +#define GCONJ1 0 +#else +#define GXCONJ1 1 +#define GCONJ1 0 +#endif + +#if !defined(XCONJ) +#define GXCONJ2 0 +#define GCONJ2 0 +#else +#define GXCONJ2 0 +#define GCONJ2 1 +#endif + +.macro ZERO_Y2 + GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR v, v, TP0, TP0, TP0 +.endm + +.macro ZLOAD_X2 + GLD v, , X0, X, 0x00, X1, X, 0x10 +.endm + +.macro ZLOAD_X2_GAP + vld X0, X, 0 + vldx X1, X, INC_X +.endm + +.macro ZGEMV_T_2x2 + GLD_INC v, , 0x10, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ + TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 +.endm + +.macro ZGEMV_T_LSX XW:req, X2:req + PTR_SRLI J, N, 1 + beqz J, .L_\XW\()_N_1 + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M16 +.L_\XW\()_N_L2: + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 1 + beqz I, .L_\XW\()_M_1 +.align 5 +.L_\XW\()_M_L2: + ZLOAD_\X2 + ZGEMV_T_2x2 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 1 + bnez I, .L_\XW\()_M_L2 +.L_\XW\()_M_1: + // Accumulated + GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 + andi I, M, 1 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#elif __loongarch_grlen == 32 + GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 +#else + GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 +#endif + + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 + + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + vld A8, Y, 0x00 + vldx A9, Y, INC_Y + + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#elif __loongarch_grlen == 32 + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + vst $vr11, Y, 0x00 + vstx $vr12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 + bnez J, .L_\XW\()_N_L2 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + PTR_SUB K_LDA, LDA, M16 +.L_\XW\()_N_L1: + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + GLD v, , A0, PA0, 0x00, X0, X, 0x00 + GCOMPLEXMADD GXCONJ1, GCONJ1, \ + vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x10 + bnez I, .L_\XW\()_N_1_M_L1 +.L_\XW\()_N_1_M_END: + PTR_ADDI J, J, -1 + vld A0, Y, 0x00 + GCOMPLEXMADD GXCONJ2, GCONJ2, \ + vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 + vst $vr3, Y, 0x00 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD Y, Y, INC_Y + bnez J, .L_\XW\()_N_L1 + + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 30 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 + // Init VALPHA + vpackev.d VALPHA, $vr1, $vr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA +#elif __loongarch_grlen == 32 + GADD , w, PA1, PA0, LDA +#else + GADD , d, PA1, PA0, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + ZGEMV_T_LSX GAP_0, X2 +.L_GAP_1: /* if (incx != 1) */ + ZGEMV_T_LSX GAP_1, X2_GAP +.L_END: + pop_if_used 17 + 8, 30 + jirl $r0, $r1, 0x0 + EPILOGUE From 8e05c053be8b889697b17f6847d94a8e4ce3c12b Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 27 Feb 2024 21:17:01 -0500 Subject: [PATCH 717/718] LoongArch64:Fixed the failed test cases test_{c/z}gemv_n in test_extensions --- kernel/loongarch64/cgemv_n_8_lasx.S | 34 ++++++++++++++--------------- kernel/loongarch64/zgemv_n_4_lasx.S | 18 +++++++-------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S index b078e3227..ba38a9573 100644 --- a/kernel/loongarch64/cgemv_n_8_lasx.S +++ b/kernel/loongarch64/cgemv_n_8_lasx.S @@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_X_8_GAP @@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d X7, T0, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2, \ - X4, X4, VALPHA, TMP0, TMP1, TMP2, \ - X5, X5, VALPHA, TMP0, TMP1, TMP2, \ - X6, X6, VALPHA, TMP0, TMP1, TMP2, \ - X7, X7, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2, \ + X4, VALPHA, X4, TMP0, TMP1, TMP2, \ + X5, VALPHA, X5, TMP0, TMP1, TMP2, \ + X6, VALPHA, X6, TMP0, TMP1, TMP2, \ + X7, VALPHA, X7, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_8 @@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro CLOAD_X_1 GLDREPL xv, d, X0, X, 0x00 GCOMPLEXMUL GXCONJ, \ - xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2 + xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2 .endm .macro CLOAD_Y_1 diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S index 98b1a6f7d..26edf1ed7 100644 --- a/kernel/loongarch64/zgemv_n_4_lasx.S +++ b/kernel/loongarch64/zgemv_n_4_lasx.S @@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 .endm .macro ZLOAD_X_4_GAP @@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvpermi.q X3, X3, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ - X1, X1, VALPHA, TMP0, TMP1, TMP2, \ - X2, X2, VALPHA, TMP0, TMP1, TMP2, \ - X3, X3, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ + X1, VALPHA, X1, TMP0, TMP1, TMP2, \ + X2, VALPHA, X2, TMP0, TMP1, TMP2, \ + X3, VALPHA, X3, TMP0, TMP1, TMP2 .endm .macro ZLOAD_Y_4 @@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GLD xv, , X0, X, 0x00 GPERMI xv, q, X0, X0, 0 GCOMPLEXMUL GXCONJ, \ - xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 + xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 .endm .macro ZGEMV_N_1x1 From b2db064285bfaa36ac44b3a61cb4930cf0076d8d Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 27 Feb 2024 10:47:49 +0800 Subject: [PATCH 718/718] Optimized sgemv and dgemv kernel LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/dgemv_n_lsx.S | 229 +++++++++++++++++++ kernel/loongarch64/dgemv_t_lsx.S | 279 +++++++++++++++++++++++ kernel/loongarch64/sgemv_n_lsx.S | 227 ++++++++++++++++++ kernel/loongarch64/sgemv_t_lsx.S | 275 ++++++++++++++++++++++ 5 files changed, 1022 insertions(+) create mode 100644 kernel/loongarch64/dgemv_n_lsx.S create mode 100644 kernel/loongarch64/dgemv_t_lsx.S create mode 100644 kernel/loongarch64/sgemv_n_lsx.S create mode 100644 kernel/loongarch64/sgemv_t_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef44035..5b54a2ada 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S CSUMKERNEL = csum_lsx.S ZSUMKERNEL = csum_lsx.S +SGEMVNKERNEL = sgemv_n_lsx.S +SGEMVTKERNEL = sgemv_t_lsx.S + +DGEMVNKERNEL = dgemv_n_lsx.S +DGEMVTKERNEL = dgemv_t_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S @@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S @@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S new file mode 100644 index 000000000..9a0141fb1 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_lsx.S @@ -0,0 +1,229 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IX, $r0 + + move AO1, A //a_ptr + move XX, X + move YY, Y + + beq J, M, .L999 + +.L01: + vldx U0, XX, IX + vshuf4i.d U0, U0, 0x00 + + vfmul.d U1, VALPHA, U0 //temp1 + + move IY, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, M, 2 //n/4 + beq I, T0, .L03 + +.L02: + vldx U2, AO1, II + addi.d II, II, 16 + vldx U7, AO1, II + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + fldx.d a3, YY, T3 + fldx.d a4, YY, T4 + + vextrins.d U3, U4, 0x10 + vextrins.d U5, U6, 0x10 + + vfmadd.d U3, U1, U2, U3 + vfmadd.d U5, U1, U7, U5 + + vextrins.d U4, U3, 0x01 + vextrins.d U6, U5, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + fstx.d a3, YY, T3 + fstx.d a4, YY, T4 + + add.d IY, T4, INCY + addi.d II, II, 16 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: + andi T0, M, 2 + beq $r0, T0, .L04 + + addi.d T1, $r0, 4 + mod.d T1, M, T1 + sub.d II, M, T1 + slli.d II, II, BASE_SHIFT + + move T1, IY + add.d T2, T1, INCY + + vldx U2, AO1, II + + fldx.d a1, YY, T1 + fldx.d a2, YY, T2 + + vextrins.d U3, U4, 0x10 + + vfmadd.d U3, U1, U2, U3 + + vextrins.d U4, U3, 0x01 + + fstx.d a1, YY, T1 + fstx.d a2, YY, T2 + + add.d IY, T2, INCY + +.L04: + andi T0, M, 1 + beq $r0, T0, .L05 + + addi.d II, M, -1 + slli.d II, II, BASE_SHIFT + + fldx.d a1, AO1, II + fldx.d a3, YY, IY + + fmadd.d a3, $f12, a1, a3 + + fstx.d a3, YY, IY + + add.d IY, IY, INCY + +.L05: + add.d AO1, AO1, LDA + add.d IX, IX, INCX + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LD ALPHA, $sp, 72 + addi.d $sp, $sp, 80 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S new file mode 100644 index 000000000..76f0d9bdc --- /dev/null +++ b/kernel/loongarch64/dgemv_t_lsx.S @@ -0,0 +1,279 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j